Add preprocessing and evaluation for LIAR dataset using DistilBERT

This commit is contained in:
2025-04-03 13:41:14 +03:00
parent 1df0e66bc8
commit 3cf9c715bc
6 changed files with 264 additions and 67 deletions

View File

@@ -5,7 +5,7 @@
🪙 Preprocessing text...
🔍 Training models...
📊 Logistic Regression Test Performance:
📊 Logistic Regression FakeNewsCorpus Performance:
precision recall f1-score support
Reliable 0.84 0.90 0.87 54706
@@ -16,7 +16,7 @@
weighted avg 0.83 0.83 0.83 85290
📊 Naïve Bayes Test Performance:
📊 Naïve Bayes FakeNewsCorpus Performance:
precision recall f1-score support
Reliable 0.79 0.92 0.85 54706
@@ -25,3 +25,28 @@ weighted avg 0.83 0.83 0.83 85290
accuracy 0.79 85290
macro avg 0.79 0.74 0.76 85290
weighted avg 0.79 0.79 0.78 85290
📚 Loading LIAR dataset...
🧮 Grouping into binary classes...
🪙 Preprocessing text...
📊 Logistic Regression LIAR Performance:
precision recall f1-score support
Reliable 0.75 0.79 0.77 926
Fake 0.32 0.26 0.29 338
accuracy 0.65 1264
macro avg 0.53 0.53 0.53 1264
weighted avg 0.63 0.65 0.64 1264
📊 Naïve Bayes LIAR Performance:
precision recall f1-score support
Reliable 0.74 0.98 0.84 926
Fake 0.55 0.06 0.11 338
accuracy 0.74 1264
macro avg 0.65 0.52 0.48 1264
weighted avg 0.69 0.74 0.65 1264

View File

@@ -32,31 +32,29 @@ except OSError:
print_log("📖 spaCy model loaded.")
# Paths
csv_path = "../data/news_cleaned_2018_02_13.csv"
parquet_path = "../data/news_cleaned_2018_02_13.parquet"
output_parquet = "../data/processed_fakenews.parquet"
output_csv = "../data/processed_fakenews.csv"
input_path = "../data/news_cleaned_2018_02_13"
output_path = "../data/processed_fakenews"
# Convert CSV to Parquet if needed
if os.path.exists(parquet_path):
data_path = parquet_path
elif os.path.exists(csv_path):
if os.path.exists(input_path + ".parquet"):
data_path = input_path + ".parquet"
elif os.path.exists(input_path + ".csv"):
print_log("🔄 Converting CSV to Parquet...")
chunksize = 100_000
pqwriter = None
for i, df in enumerate(pd.read_csv(csv_path, lineterminator="\n", on_bad_lines="skip", chunksize=chunksize, usecols=["id", "content", "type"])):
for i, df in enumerate(pd.read_csv(input_path + ".csv", lineterminator="\n", on_bad_lines="skip", chunksize=chunksize, usecols=["id", "content", "type"])):
table = pa.Table.from_pandas(df)
# If it's the first chunk, create a new parquet writer
if i == 0:
pqwriter = pq.ParquetWriter(parquet_path, table.schema)
pqwriter = pq.ParquetWriter(input_path + ".parquet", table.schema)
pqwriter.write_table(table)
if pqwriter:
pqwriter.close()
print_log("✅ Conversion complete.")
data_path = parquet_path
data_path = input_path + ".parquet"
else:
print_log("❌ Error: No dataset found.")
exit()
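The hunk below iterates the Parquet file in batches; the reader setup sits in unchanged context outside this diff. A minimal sketch of the assumed pattern (batch_size is an illustrative value, not necessarily the commit's):
import pyarrow.parquet as pq
# Assumed setup (not part of this diff): stream the file batch by batch
# so the full corpus never has to fit in memory at once.
parquet_file = pq.ParquetFile(data_path)   # data_path resolved above
batch_size = 10_000                        # illustrative
processed_chunks = []
for batch in parquet_file.iter_batches(batch_size):
    chunk = batch.to_pandas()              # pyarrow.RecordBatch -> DataFrame
    processed_chunks.append(chunk)         # the real script preprocesses each chunk first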
@@ -120,12 +118,12 @@ for batch in parquet_file.iter_batches(batch_size):
# Save processed data
final_df = pd.concat(processed_chunks, ignore_index=True)
final_df.to_parquet(output_parquet, index=False)
final_df.to_csv(output_csv, index=False)
final_df.to_parquet(output_path + ".parquet", index=False)
final_df.to_csv(output_path + ".csv", index=False)
print_log(f"💾 Processed data saved to '{output_parquet}' and '{output_csv}'")
print_log(f"💾 Processed data saved to '{output_path + ".parquet"}' and '{output_path + ".csv"}'")
# Print statistics
total_vocab_before = len(vocab_before)
total_vocab_after_stopwords = len(vocab_after_stopwords)
total_vocab_after_stemming = len(vocab_after_stemming)

View File

@@ -1,13 +1,10 @@
import random
import pandas as pd
import os
import subprocess
import pyarrow as pa
import pyarrow.parquet as pq
parquet_path = "../data/processed_fakenews.parquet"
csv_path = "../data/processed_fakenews.csv"
sample_path = "../data/sampled_fakenews"
input_path = "../data/processed_fakenews"
output_path = "../data/sampled_fakenews"
SAMPLE_FRACTION = 0.1
RANDOM_SEED = 42 # For reproducibility
@@ -22,57 +19,57 @@ def sample_dataframe(df, total_rows):
return df.sample(n=sample_size, random_state=RANDOM_SEED)
# Try to load from Parquet first, fall back to CSV if not available
if os.path.exists(parquet_path):
print(f"🔍 Loading data from Parquet file at '{parquet_path}'")
if os.path.exists(input_path + ".parquet"):
print(f"🔍 Loading data from Parquet file at '{input_path + ".parquet"}'")
try:
# Read metadata to get row count without loading entire file
parquet_file = pq.ParquetFile(parquet_path)
parquet_file = pq.ParquetFile(input_path + ".parquet")
total_rows = parquet_file.metadata.num_rows
print(f"🔍 Dataset contains {total_rows:,} rows.")
# Read and sample the data
df_sample = sample_dataframe(pd.read_parquet(parquet_path), total_rows)
df_sample = sample_dataframe(pd.read_parquet(input_path + ".parquet"), total_rows)
except Exception as e:
print(f"❌ Error reading Parquet file: {e}")
print("🔄 Falling back to CSV...")
if not os.path.exists(csv_path):
print(f"❌ Error: Neither Parquet nor CSV file found at {parquet_path} or {csv_path}")
if not os.path.exists(input_path + ".csv"):
print(f"❌ Error: Neither Parquet nor CSV file found at {input_path + ".parquet"} or {input_path + ".csv"}")
exit()
# Get total rows from CSV (Unix-like systems only due to `wc`)
total_rows = int(subprocess.check_output(["wc", "-l", csv_path]).split()[0]) - 1
total_rows = int(subprocess.check_output(["wc", "-l", input_path + ".csv"]).split()[0]) - 1
print(f"🔍 Dataset contains {total_rows:,} rows.")
# Read and sample the data
df_sample = sample_dataframe(
pd.read_csv(csv_path, lineterminator="\n", on_bad_lines="skip"),
pd.read_csv(input_path + ".csv", lineterminator="\n", on_bad_lines="skip"),
total_rows
)
elif os.path.exists(csv_path):
print(f"🔍 Parquet file not found, loading from CSV at {csv_path}")
elif os.path.exists(input_path + ".csv"):
print(f"🔍 Parquet file not found, loading from CSV at {input_path + ".csv"}")
# Get total rows from CSV (Unix-like systems only due to `wc`)
total_rows = int(subprocess.check_output(["wc", "-l", csv_path]).split()[0]) - 1
total_rows = int(subprocess.check_output(["wc", "-l", input_path + ".csv"]).split()[0]) - 1
print(f"🔍 Dataset contains {total_rows:,} rows.")
# Read and sample the data
df_sample = sample_dataframe(
pd.read_csv(csv_path, lineterminator="\n", on_bad_lines="skip"),
pd.read_csv(input_path + ".csv", lineterminator="\n", on_bad_lines="skip"),
total_rows
)
else:
print(f"❌ Error: Neither Parquet nor CSV file found at {parquet_path} or {csv_path}")
print(f"❌ Error: Neither Parquet nor CSV file found at {input_path + ".parquet"} or {input_path + ".csv"}")
exit()
# Verify the sample size
print(f"✅ Sample contains {len(df_sample):,} rows (expected {get_sample_size(total_rows=total_rows):,} rows)")
# Save the sample in both formats
df_sample.to_csv(f"{sample_path}.csv", index=False)
df_sample.to_parquet(f"{sample_path}.parquet", index=False)
df_sample.to_csv(f"{output_path}.csv", index=False)
df_sample.to_parquet(f"{output_path}.parquet", index=False)
print(f"💾 Sample saved to '{sample_path}.csv' and '{sample_path}.parquet'.")
print(f"💾 Sample saved to '{output_path}.csv' and '{output_path}.parquet'.")
# Split to 80/10/10 and save as both CSV and Parquet
train_size = int(len(df_sample) * 0.8)
@@ -83,13 +80,13 @@ df_train = df_sample.iloc[:train_size]
df_valid = df_sample.iloc[train_size:train_size + valid_size]
df_test = df_sample.iloc[train_size + valid_size:]
df_train.to_csv(f"{sample_path}_train.csv", index=False)
df_valid.to_csv(f"{sample_path}_valid.csv", index=False)
df_test.to_csv(f"{sample_path}_test.csv", index=False)
df_train.to_csv(f"{output_path}_train.csv", index=False)
df_valid.to_csv(f"{output_path}_valid.csv", index=False)
df_test.to_csv(f"{output_path}_test.csv", index=False)
df_train.to_parquet(f"{sample_path}_train.parquet", index=False)
df_valid.to_parquet(f"{sample_path}_valid.parquet", index=False)
df_test.to_parquet(f"{sample_path}_test.parquet", index=False)
df_train.to_parquet(f"{output_path}_train.parquet", index=False)
df_valid.to_parquet(f"{output_path}_valid.parquet", index=False)
df_test.to_parquet(f"{output_path}_test.parquet", index=False)
print(f"💾 Train/Valid/Test splits saved to '{sample_path}_train.csv', '{sample_path}_valid.csv', '{sample_path}_test.csv'.")
print(f"💾 Train/Valid/Test splits saved to '{output_path}_train.csv', '{output_path}_valid.csv', '{output_path}_test.csv'.")

View File

@@ -5,6 +5,30 @@ from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
input_path = "../data/sampled_fakenews"
# Function to perform hyperparameter tuning, not used in the final script
def hyperparameter_tuning():
print("🔍 Hyperparameter tuning...")
param_grid_lr = {
'C': [0.1, 1, 10],
'max_iter': [100, 500, 1000],
'class_weight': ['balanced', None]
}
grid = GridSearchCV(LogisticRegression(), param_grid_lr, cv=3)
grid.fit(X_train, y_train)
print("✅ Best Logistic Regression Parameters:", grid.best_params_)
param_grid_nb = {
'alpha': [0.1, 0.5, 1.0, 2.0],
'fit_prior': [True, False]
}
grid_nb = GridSearchCV(MultinomialNB(), param_grid_nb, cv=3)
grid_nb.fit(X_val, y_val)
print("✅ Best Naïve Bayes Parameters:", grid_nb.best_params_)
#---FAKENEWSCORPUS DATASET---
# Load parquet first, fall back to CSV if not available
def load_split(file_prefix, split_name):
try:
@@ -17,9 +41,9 @@ def load_split(file_prefix, split_name):
print(f"🔄 Parquet file not found, loading from CSV at '{file_prefix}_{split_name}.csv'")
return pd.read_csv(f"{file_prefix}_{split_name}.csv")
train = load_split("../data/sampled_fakenews", "train")
val = load_split("../data/sampled_fakenews", "valid")
test = load_split("../data/sampled_fakenews", "test")
train = load_split(input_path, "train")
val = load_split(input_path, "valid")
test = load_split(input_path, "test")
# "Political" and "bias" may not be inherently fake, and "unknown" is neutral
print("🧮 Grouping into binary classes...")
@@ -46,29 +70,37 @@ nb = MultinomialNB(alpha=0.1, fit_prior=True)
nb.fit(X_train, y_train)
y_test_pred_lr = lr.predict(X_test)
print("\n📊 Logistic Regression Test Performance:")
print("\n📊 Logistic Regression FakeNewsCorpus Performance:")
print(classification_report(y_test, y_test_pred_lr, target_names=['Reliable', 'Fake']))
y_test_pred_nb = nb.predict(X_test)
print("\n📊 Naïve Bayes Test Performance:")
print("\n📊 Naïve Bayes FakeNewsCorpus Performance:")
print(classification_report(y_test, y_test_pred_nb, target_names=['Reliable', 'Fake']))
# Function to perform hyperparameter tuning, not used in the final script
def hyperparameter_tuning():
print("🔍 Hyperparameter tuning...")
param_grid_lr = {
'C': [0.1, 1, 10],
'max_iter': [100, 500, 1000],
'class_weight': ['balanced', None]
}
grid = GridSearchCV(LogisticRegression(), param_grid_lr, cv=3)
grid.fit(X_train, y_train)
print("✅ Best Logistic Regression Parameters:", grid.best_params_)
#---LIAR DATASET---
# Load the preprocessed LIAR test split (CSV)
print("📚 Loading LIAR dataset...")
liar_test = pd.read_csv("../data/liar_test_processed.csv")
# "Political" and "bias" may not be inherently fake, and "unknown" is neutral
print("🧮 Grouping into binary classes...")
liar_fake_labels = {'false', 'pants-fire'}
liar_test['label'] = liar_test.iloc[:, 1].apply(lambda x: 1 if x in liar_fake_labels else 0)
# Check for NaN values in processed_text
liar_test = liar_test.dropna(subset=['processed_text'])
# Transform LIAR text using the same TF-IDF vectorizer
print("🪙 Preprocessing text...")
X_liar_test = tfidf.transform(liar_test['processed_text'])
# Logistic Regression
y_liar_pred_lr = lr.predict(X_liar_test)
print("\n📊 Logistic Regression LIAR Performance:")
print(classification_report(liar_test['label'], y_liar_pred_lr, target_names=['Reliable', 'Fake']))
# Naïve Bayes
y_liar_pred_nb = nb.predict(X_liar_test)
print("\n📊 Naïve Bayes LIAR Performance:")
print(classification_report(liar_test['label'], y_liar_pred_nb, target_names=['Reliable', 'Fake']))
param_grid_nb = {
'alpha': [0.1, 0.5, 1.0, 2.0],
'fit_prior': [True, False]
}
grid_nb = GridSearchCV(MultinomialNB(), param_grid_nb, cv=3)
grid_nb.fit(X_val, y_val)
print("✅ Best Naïve Bayes Parameters:", grid_nb.best_params_)

src/fnc4a.py (new file, 38 lines)
View File

@@ -0,0 +1,38 @@
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
input_path = "../data/liar"
output_path = "../data/liar_processed"
# Initialize preprocessing tools
stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))
# Apply the same process as FakeNewsCorpus
def preprocess_text(text):
if not isinstance(text, str):
return ""
# Tokenization
tokens = [word.lower() for word in text.split() if word.isalpha()]
# Stopword removal
tokens = [word for word in tokens if word not in stop_words]
# Stemming
tokens = [stemmer.stem(word) for word in tokens]
return ' '.join(tokens)
# Load LIAR dataset
print("🔍 Loading LIAR dataset...")
liar_test = pd.read_csv(input_path + "_test.tsv", sep='\t', header=None)
# Apply preprocessing (column 2 contains the text statements)
print("🪙 Preprocessing LIAR text...")
liar_test['processed_text'] = liar_test[2].apply(preprocess_text)
# Save preprocessed data
liar_test.to_csv(output_path + "_test.tsv", index=False)
print(f"💾 Preprocessed LIAR data saved to '{output_path + "_test.tsv"}'")

src/fnc4b.py (new file, 107 lines)
View File

@@ -0,0 +1,107 @@
import torch
from torch.utils.data import Dataset
import pandas as pd
import numpy as np
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
from tqdm import tqdm
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification
import matplotlib.pyplot as plt
# 1. Load and preprocess LIAR dataset
print("📚 Loading LIAR dataset...")
liar_test = pd.read_csv("../data/liar_test_processed.csv")
# Binary label mapping, mirroring the grouping used for the classical baselines
print("🧮 Grouping into binary classes...")
liar_fake_labels = {'false', 'pants-fire'}  # the harshest two of LIAR's six labels
liar_test['label'] = liar_test.iloc[:, 1].apply(lambda x: 1 if x in liar_fake_labels else 0)
# Drop rows whose processed text is missing, mirroring the classical pipeline
liar_test = liar_test.dropna(subset=['processed_text'])
# 2. Load model and tokenizer
model_path = "./fake_news_bert"
print(f"⬇️ Loading model from {model_path}...")
tokenizer = DistilBertTokenizerFast.from_pretrained(model_path, do_lower_case=True)
model = DistilBertForSequenceClassification.from_pretrained(model_path, num_labels=2)
# 3. Tokenization
print("🪙 Tokenizing text...")
def tokenize_data(texts, max_length=512):
results = {'input_ids': [], 'attention_mask': []}
batch_size = 1000
for i in tqdm(range(0, len(texts), batch_size), desc="Tokenizing"):
batch = texts[i:i+batch_size]
encoded = tokenizer(
batch,
truncation=True,
padding='max_length',
max_length=max_length,
return_tensors='pt',
return_attention_mask=True,
return_token_type_ids=False
)
results['input_ids'].append(encoded['input_ids'])
results['attention_mask'].append(encoded['attention_mask'])
return {
'input_ids': torch.cat(results['input_ids']),
'attention_mask': torch.cat(results['attention_mask'])
}
test_encodings = tokenize_data(liar_test['processed_text'].tolist())
# 4. Dataset Class
class CustomDataset(Dataset):
def __init__(self, encodings, labels):
self.encodings = encodings
self.labels = torch.tensor(labels.values, dtype=torch.long)
def __getitem__(self, idx):
return {
'input_ids': self.encodings['input_ids'][idx],
'attention_mask': self.encodings['attention_mask'][idx],
'labels': self.labels[idx]
}
def __len__(self):
return len(self.labels)
print("\n📝 Creating dataset...")
test_dataset = CustomDataset(test_encodings, liar_test['label'])
# 5. Prediction Function
def predict(model, dataset, batch_size=32):
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()
preds = []
true_labels = []
for i in tqdm(range(0, len(dataset), batch_size), desc="Predicting"):
# Get batch
batch_indices = range(i, min(i+batch_size, len(dataset)))
batch = [dataset[j] for j in batch_indices]
# Prepare inputs
inputs = {
'input_ids': torch.stack([item['input_ids'] for item in batch]).to(device),
'attention_mask': torch.stack([item['attention_mask'] for item in batch]).to(device)
}
# Predict
with torch.no_grad():
outputs = model(**inputs)
# Store results
preds.extend(torch.argmax(outputs.logits, dim=1).cpu().numpy())
true_labels.extend([item['labels'].item() for item in batch])
return np.array(preds), np.array(true_labels)
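Since CustomDataset already implements the Dataset protocol, the manual batching above could equally lean on torch's DataLoader; an alternative sketch (an assumption, not what the commit ships):
from torch.utils.data import DataLoader
def predict_with_loader(model, dataset, batch_size=32):
    # Same logic as predict(), but DataLoader handles batching and collation.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    model.eval()
    loader = DataLoader(dataset, batch_size=batch_size)
    preds, true_labels = [], []
    with torch.no_grad():
        for batch in loader:
            outputs = model(
                input_ids=batch['input_ids'].to(device),
                attention_mask=batch['attention_mask'].to(device),
            )
            preds.extend(torch.argmax(outputs.logits, dim=1).cpu().numpy())
            true_labels.extend(batch['labels'].numpy())
    return np.array(preds), np.array(true_labels)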
# 6. Run Evaluation
print("\n🧪 Evaluating on LIAR test set...")
y_pred, y_true = predict(model, test_dataset)
# 7. Performance Report
print("\n📊 DistilBERT Performance on LIAR Dataset:")
print(classification_report(y_true, y_pred, target_names=['Reliable', 'Fake']))
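confusion_matrix, ConfusionMatrixDisplay, and matplotlib are imported above but unused in this hunk, which suggests a plot was intended; a minimal sketch using those imports (an assumption):
# Sketch (assumption): visualize the report above as a confusion matrix.
cm = confusion_matrix(y_true, y_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=['Reliable', 'Fake'])
disp.plot(cmap='Blues')
plt.title('DistilBERT on LIAR')
plt.show()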