Add preprocessing and evaluation for LIAR dataset using DistilBERT

2025-04-03 13:41:14 +03:00
parent 1df0e66bc8
commit 3cf9c715bc
6 changed files with 264 additions and 67 deletions

View File

@@ -5,7 +5,7 @@
 🪙 Preprocessing text...
 🔍 Training models...
-📊 Logistic Regression Test Performance:
+📊 Logistic Regression FakeNewsCorpus Performance:
               precision    recall  f1-score   support
     Reliable       0.84      0.90      0.87     54706
@@ -16,7 +16,7 @@
 weighted avg       0.83      0.83      0.83     85290
-📊 Naïve Bayes Test Performance:
+📊 Naïve Bayes FakeNewsCorpus Performance:
               precision    recall  f1-score   support
     Reliable       0.79      0.92      0.85     54706
@@ -25,3 +25,28 @@ weighted avg       0.83      0.83      0.83     85290
     accuracy                           0.79     85290
    macro avg       0.79      0.74      0.76     85290
 weighted avg       0.79      0.79      0.78     85290
+📚 Loading LIAR dataset...
+🧮 Grouping into binary classes...
+🪙 Preprocessing text...
+📊 Logistic Regression LIAR Performance:
+              precision    recall  f1-score   support
+    Reliable       0.75      0.79      0.77       926
+        Fake       0.32      0.26      0.29       338
+    accuracy                           0.65      1264
+   macro avg       0.53      0.53      0.53      1264
+weighted avg       0.63      0.65      0.64      1264
+📊 Naïve Bayes LIAR Performance:
+              precision    recall  f1-score   support
+    Reliable       0.74      0.98      0.84       926
+        Fake       0.55      0.06      0.11       338
+    accuracy                           0.74      1264
+   macro avg       0.65      0.52      0.48      1264
+weighted avg       0.69      0.74      0.65      1264

View File

@@ -32,31 +32,29 @@ except OSError:
 print_log("📖 spaCy model loaded.")
 
 # Paths
-csv_path = "../data/news_cleaned_2018_02_13.csv"
-parquet_path = "../data/news_cleaned_2018_02_13.parquet"
-output_parquet = "../data/processed_fakenews.parquet"
-output_csv = "../data/processed_fakenews.csv"
+input_path = "../data/news_cleaned_2018_02_13"
+output_path = "../data/processed_fakenews"
 
 # Convert CSV to Parquet if needed
-if os.path.exists(parquet_path):
-    data_path = parquet_path
-elif os.path.exists(csv_path):
+if os.path.exists(input_path + ".parquet"):
+    data_path = input_path + ".parquet"
+elif os.path.exists(input_path + ".csv"):
     print_log("🔄 Converting CSV to Parquet...")
     chunksize=1e5
     pqwriter = None
-    for i, df in enumerate(pd.read_csv(csv_path, lineterminator="\n", on_bad_lines="skip", chunksize=chunksize, usecols=["id", "content", "type"])):
+    for i, df in enumerate(pd.read_csv(input_path + ".csv", lineterminator="\n", on_bad_lines="skip", chunksize=chunksize, usecols=["id", "content", "type"])):
         table = pa.Table.from_pandas(df)
         # If it's the first chunk, create a new parquet writer
         if i == 0:
-            pqwriter = pq.ParquetWriter(parquet_path, table.schema)
+            pqwriter = pq.ParquetWriter(input_path + ".parquet", table.schema)
         pqwriter.write_table(table)
     if pqwriter:
         pqwriter.close()
     print_log("✅ Conversion complete.")
-    data_path = parquet_path
+    data_path = input_path + ".parquet"
 else:
     print_log("❌ Error: No dataset found.")
     exit()
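
For context, the hunk below shows the converted Parquet file being consumed with `iter_batches`. A minimal sketch of that read-back pattern, where `preprocess_chunk` is a hypothetical stand-in for the real per-chunk processing that falls outside the captured hunks:

parquet_file = pq.ParquetFile(data_path)
batch_size = 100_000                      # illustrative value
processed_chunks = []
for batch in parquet_file.iter_batches(batch_size):
    chunk = batch.to_pandas()             # each Arrow record batch becomes a small DataFrame
    processed_chunks.append(preprocess_chunk(chunk))
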
@@ -120,12 +118,12 @@ for batch in parquet_file.iter_batches(batch_size):
 # Save processed data
 final_df = pd.concat(processed_chunks, ignore_index=True)
-final_df.to_parquet(output_parquet, index=False)
-final_df.to_csv(output_csv, index=False)
-print_log(f"💾 Processed data saved to '{output_parquet}' and '{output_csv}'")
+final_df.to_parquet(output_path + ".parquet", index=False)
+final_df.to_csv(output_path + ".csv", index=False)
+print_log(f"💾 Processed data saved to '{output_path}.parquet' and '{output_path}.csv'")
 
 # Print statistics
 total_vocab_before = len(vocab_before)
 total_vocab_after_stopwords = len(vocab_after_stopwords)
 total_vocab_after_stemming = len(vocab_after_stemming)
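
The trailing context lines above reference `vocab_before`, `vocab_after_stopwords` and `vocab_after_stemming`, whose construction falls outside the diff. A minimal sketch of how such counters are typically accumulated while each chunk is preprocessed, assuming stop-word and stemmer objects like those created earlier in the script (names here are illustrative):

vocab_before = set()
vocab_after_stopwords = set()
vocab_after_stemming = set()

def update_vocab_stats(tokens):
    # tokens: lowercased word list for one document, before any filtering
    vocab_before.update(tokens)
    kept = [t for t in tokens if t not in stop_words]
    vocab_after_stopwords.update(kept)
    vocab_after_stemming.update(stemmer.stem(t) for t in kept)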

View File

@@ -1,13 +1,10 @@
-import random
 import pandas as pd
 import os
 import subprocess
-import pyarrow as pa
 import pyarrow.parquet as pq
 
-parquet_path = "../data/processed_fakenews.parquet"
-csv_path = "../data/processed_fakenews.csv"
-sample_path = "../data/sampled_fakenews"
+input_path = "../data/processed_fakenews"
+output_path = "../data/sampled_fakenews"
 
 SAMPLE_FRACTION = 0.1
 RANDOM_SEED = 42  # For reproducibility
@@ -22,57 +19,57 @@ def sample_dataframe(df, total_rows):
     return df.sample(n=sample_size, random_state=RANDOM_SEED)
 
 # Try to load from Parquet first, fall back to CSV if not available
-if os.path.exists(parquet_path):
-    print(f"🔍 Loading data from Parquet file at '{parquet_path}'")
+if os.path.exists(input_path + ".parquet"):
+    print(f"🔍 Loading data from Parquet file at '{input_path}.parquet'")
     try:
         # Read metadata to get row count without loading entire file
-        parquet_file = pq.ParquetFile(parquet_path)
+        parquet_file = pq.ParquetFile(input_path + ".parquet")
         total_rows = parquet_file.metadata.num_rows
         print(f"🔍 Dataset contains {total_rows:,} rows.")
         # Read and sample the data
-        df_sample = sample_dataframe(pd.read_parquet(parquet_path), total_rows)
+        df_sample = sample_dataframe(pd.read_parquet(input_path + ".parquet"), total_rows)
     except Exception as e:
         print(f"❌ Error reading Parquet file: {e}")
         print("🔄 Falling back to CSV...")
-        if not os.path.exists(csv_path):
-            print(f"❌ Error: Neither Parquet nor CSV file found at {parquet_path} or {csv_path}")
+        if not os.path.exists(input_path + ".csv"):
+            print(f"❌ Error: Neither Parquet nor CSV file found at {input_path}.parquet or {input_path}.csv")
             exit()
         # Get total rows from CSV (Unix-like systems only due to `wc`)
-        total_rows = int(subprocess.check_output(["wc", "-l", csv_path]).split()[0]) - 1
+        total_rows = int(subprocess.check_output(["wc", "-l", input_path + ".csv"]).split()[0]) - 1
         print(f"🔍 Dataset contains {total_rows:,} rows.")
         # Read and sample the data
         df_sample = sample_dataframe(
-            pd.read_csv(csv_path, lineterminator="\n", on_bad_lines="skip"),
+            pd.read_csv(input_path + ".csv", lineterminator="\n", on_bad_lines="skip"),
             total_rows
         )
-elif os.path.exists(csv_path):
-    print(f"🔍 Parquet file not found, loading from CSV at {csv_path}")
+elif os.path.exists(input_path + ".csv"):
+    print(f"🔍 Parquet file not found, loading from CSV at {input_path}.csv")
     # Get total rows from CSV (Unix-like systems only due to `wc`)
-    total_rows = int(subprocess.check_output(["wc", "-l", csv_path]).split()[0]) - 1
+    total_rows = int(subprocess.check_output(["wc", "-l", input_path + ".csv"]).split()[0]) - 1
    print(f"🔍 Dataset contains {total_rows:,} rows.")
     # Read and sample the data
     df_sample = sample_dataframe(
-        pd.read_csv(csv_path, lineterminator="\n", on_bad_lines="skip"),
+        pd.read_csv(input_path + ".csv", lineterminator="\n", on_bad_lines="skip"),
         total_rows
     )
 else:
-    print(f"❌ Error: Neither Parquet nor CSV file found at {parquet_path} or {csv_path}")
+    print(f"❌ Error: Neither Parquet nor CSV file found at {input_path}.parquet or {input_path}.csv")
     exit()
 
 # Verify the sample size
 print(f"✅ Sample contains {len(df_sample):,} rows (expected {get_sample_size(total_rows=total_rows):,} rows)")
 
 # Save the sample in both formats
-df_sample.to_csv(f"{sample_path}.csv", index=False)
-df_sample.to_parquet(f"{sample_path}.parquet", index=False)
-print(f"💾 Sample saved to '{sample_path}.csv' and '{sample_path}.parquet'.")
+df_sample.to_csv(f"{output_path}.csv", index=False)
+df_sample.to_parquet(f"{output_path}.parquet", index=False)
+print(f"💾 Sample saved to '{output_path}.csv' and '{output_path}.parquet'.")
 
 # Split to 80/10/10 and save as both CSV and Parquet
 train_size = int(len(df_sample) * 0.8)
@@ -83,13 +80,13 @@ df_train = df_sample.iloc[:train_size]
 df_valid = df_sample.iloc[train_size:train_size + valid_size]
 df_test = df_sample.iloc[train_size + valid_size:]
 
-df_train.to_csv(f"{sample_path}_train.csv", index=False)
-df_valid.to_csv(f"{sample_path}_valid.csv", index=False)
-df_test.to_csv(f"{sample_path}_test.csv", index=False)
-df_train.to_parquet(f"{sample_path}_train.parquet", index=False)
-df_valid.to_parquet(f"{sample_path}_valid.parquet", index=False)
-df_test.to_parquet(f"{sample_path}_test.parquet", index=False)
-print(f"💾 Train/Valid/Test splits saved to '{sample_path}_train.csv', '{sample_path}_valid.csv', '{sample_path}_test.csv'.")
+df_train.to_csv(f"{output_path}_train.csv", index=False)
+df_valid.to_csv(f"{output_path}_valid.csv", index=False)
+df_test.to_csv(f"{output_path}_test.csv", index=False)
+df_train.to_parquet(f"{output_path}_train.parquet", index=False)
+df_valid.to_parquet(f"{output_path}_valid.parquet", index=False)
+df_test.to_parquet(f"{output_path}_test.parquet", index=False)
+print(f"💾 Train/Valid/Test splits saved to '{output_path}_train.csv', '{output_path}_valid.csv', '{output_path}_test.csv'.")
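
The CSV fallback above counts rows with `wc -l`, which the comments note is Unix-only. A hedged, cross-platform alternative (not part of this commit) that streams the file instead of shelling out:

def count_csv_rows(path):
    # Count newlines without loading the file into memory; subtract 1 for the header row
    with open(path, "rb") as f:
        return sum(1 for _ in f) - 1

# total_rows = count_csv_rows(input_path + ".csv")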

View File

@@ -5,6 +5,30 @@ from sklearn.model_selection import GridSearchCV
 from sklearn.naive_bayes import MultinomialNB
 from sklearn.metrics import classification_report
 
+input_path = "../data/sampled_fakenews"
+
+# Function to perform hyperparameter tuning, not used in the final script
+def hyperparameter_tuning():
+    print("🔍 Hyperparameter tuning...")
+    param_grid_lr = {
+        'C': [0.1, 1, 10],
+        'max_iter': [100, 500, 1000],
+        'class_weight': ['balanced', None]
+    }
+    grid = GridSearchCV(LogisticRegression(), param_grid_lr, cv=3)
+    grid.fit(X_train, y_train)
+    print("✅ Best Logistic Regression Parameters:", grid.best_params_)
+    param_grid_nb = {
+        'alpha': [0.1, 0.5, 1.0, 2.0],
+        'fit_prior': [True, False]
+    }
+    grid_nb = GridSearchCV(MultinomialNB(), param_grid_nb, cv=3)
+    grid_nb.fit(X_val, y_val)
+    print("✅ Best Naïve Bayes Parameters:", grid_nb.best_params_)
+
+#---FAKENEWSCORPUS DATASET---
 # Load parquet first, fall back to CSV if not available
 def load_split(file_prefix, split_name):
     try:
@@ -17,9 +41,9 @@ def load_split(file_prefix, split_name):
         print(f"🔄 Parquet file not found, loading from CSV at '{file_prefix}_{split_name}.csv'")
         return pd.read_csv(f"{file_prefix}_{split_name}.csv")
 
-train = load_split("../data/sampled_fakenews", "train")
-val = load_split("../data/sampled_fakenews", "valid")
-test = load_split("../data/sampled_fakenews", "test")
+train = load_split(input_path, "train")
+val = load_split(input_path, "valid")
+test = load_split(input_path, "test")
 
 # "Political" and "bias" may not be inherently fake, and "unknown" is neutral
 print("🧮 Grouping into binary classes...")
@@ -46,29 +70,37 @@ nb = MultinomialNB(alpha=0.1, fit_prior=True)
 nb.fit(X_train, y_train)
 
 y_test_pred_lr = lr.predict(X_test)
-print("\n📊 Logistic Regression Test Performance:")
+print("\n📊 Logistic Regression FakeNewsCorpus Performance:")
 print(classification_report(y_test, y_test_pred_lr, target_names=['Reliable', 'Fake']))
 
 y_test_pred_nb = nb.predict(X_test)
-print("\n📊 Naïve Bayes Test Performance:")
+print("\n📊 Naïve Bayes FakeNewsCorpus Performance:")
 print(classification_report(y_test, y_test_pred_nb, target_names=['Reliable', 'Fake']))
 
-# Function to perform hyperparameter tuning, not used in the final script
-def hyperparameter_tuning():
-    print("🔍 Hyperparameter tuning...")
-    param_grid_lr = {
-        'C': [0.1, 1, 10],
-        'max_iter': [100, 500, 1000],
-        'class_weight': ['balanced', None]
-    }
-    grid = GridSearchCV(LogisticRegression(), param_grid_lr, cv=3)
-    grid.fit(X_train, y_train)
-    print("✅ Best Logistic Regression Parameters:", grid.best_params_)
-    param_grid_nb = {
-        'alpha': [0.1, 0.5, 1.0, 2.0],
-        'fit_prior': [True, False]
-    }
-    grid_nb = GridSearchCV(MultinomialNB(), param_grid_nb, cv=3)
-    grid_nb.fit(X_val, y_val)
-    print("✅ Best Naïve Bayes Parameters:", grid_nb.best_params_)
+#---LIAR DATASET---
+# Load the preprocessed LIAR test split
+print("📚 Loading LIAR dataset...")
+liar_test = pd.read_csv("../data/liar_test_processed.csv")
+
+# Only 'false' and 'pants-fire' are treated as fake; the remaining ratings count as reliable
+print("🧮 Grouping into binary classes...")
+liar_fake_labels = {'false', 'pants-fire'}
+liar_test['label'] = liar_test.iloc[:, 1].apply(lambda x: 1 if x in liar_fake_labels else 0)
+
+# Drop rows with NaN values in processed_text
+liar_test = liar_test.dropna(subset=['processed_text'])
+
+# Transform LIAR text using the same TF-IDF vectorizer fitted on FakeNewsCorpus
+print("🪙 Preprocessing text...")
+X_liar_test = tfidf.transform(liar_test['processed_text'])
+
+# Logistic Regression
+y_liar_pred_lr = lr.predict(X_liar_test)
+print("\n📊 Logistic Regression LIAR Performance:")
+print(classification_report(liar_test['label'], y_liar_pred_lr, target_names=['Reliable', 'Fake']))
+
+# Naïve Bayes
+y_liar_pred_nb = nb.predict(X_liar_test)
+print("\n📊 Naïve Bayes LIAR Performance:")
+print(classification_report(liar_test['label'], y_liar_pred_nb, target_names=['Reliable', 'Fake']))
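
The important detail in the LIAR block above is that `tfidf.transform` is used rather than `fit_transform`, so LIAR statements are projected into the exact feature space (vocabulary and IDF weights) learned from FakeNewsCorpus. A condensed sketch of the pattern, with illustrative column names and vectorizer settings (the real ones are defined earlier in this file, outside the captured hunks):

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

tfidf = TfidfVectorizer(max_features=50000)                  # illustrative settings
X_train = tfidf.fit_transform(train['content'])              # fit on FakeNewsCorpus only
lr = LogisticRegression(max_iter=1000).fit(X_train, y_train)
X_liar_test = tfidf.transform(liar_test['processed_text'])   # reuse the fitted vocabulary/IDF weights
y_liar_pred_lr = lr.predict(X_liar_test)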

src/fnc4a.py (new file, 38 lines)
View File

@@ -0,0 +1,38 @@
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

input_path = "../data/liar"
output_path = "../data/liar_processed"

# Initialize preprocessing tools
stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))

# Apply the same process as FakeNewsCorpus
def preprocess_text(text):
    if not isinstance(text, str):
        return ""
    # Tokenization
    tokens = [word.lower() for word in text.split() if word.isalpha()]
    # Stopword removal
    tokens = [word for word in tokens if word not in stop_words]
    # Stemming
    tokens = [stemmer.stem(word) for word in tokens]
    return ' '.join(tokens)

# Load LIAR dataset
print("🔍 Loading LIAR dataset...")
liar_test = pd.read_csv(input_path + "_test.tsv", sep='\t', header=None)

# Apply preprocessing (column 2 contains the text statements)
print("🪙 Preprocessing LIAR text...")
liar_test['processed_text'] = liar_test[2].apply(preprocess_text)

# Save preprocessed data
liar_test.to_csv(output_path + "_test.tsv", index=False)
print(f"💾 Preprocessed LIAR data saved to '{output_path}_test.tsv'")
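
The raw LIAR TSV has no header row; column 1 holds the truthfulness label and column 2 the statement text, which is why those positional indices are used here and in the evaluation scripts. For readability the columns could be named on load; a small sketch assuming the standard LIAR column order:

liar_test = pd.read_csv(input_path + "_test.tsv", sep='\t', header=None)
liar_test = liar_test.rename(columns={0: 'id', 1: 'label', 2: 'statement'})
liar_test['processed_text'] = liar_test['statement'].apply(preprocess_text)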

src/fnc4b.py (new file, 107 lines)
View File

@@ -0,0 +1,107 @@
import torch
from torch.utils.data import Dataset
import pandas as pd
import numpy as np
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
from tqdm import tqdm
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification
import matplotlib.pyplot as plt

# 1. Load and preprocess LIAR dataset
print("📚 Loading LIAR dataset...")
liar_test = pd.read_csv("../data/liar_test_processed.csv")

# Binary label mapping (adjust based on your LIAR preprocessing)
print("🧮 Grouping into binary classes...")
liar_fake_labels = {'false', 'pants-fire'}  # Update with your actual LIAR labels
liar_test['label'] = liar_test.iloc[:, 1].apply(lambda x: 1 if x in liar_fake_labels else 0)
# Drop rows with missing processed text so the tokenizer only receives strings
liar_test = liar_test.dropna(subset=['processed_text'])

# 2. Load model and tokenizer
model_path = "./fake_news_bert"
print(f"⬇️ Loading model from {model_path}...")
tokenizer = DistilBertTokenizerFast.from_pretrained(model_path, do_lower_case=True)
model = DistilBertForSequenceClassification.from_pretrained(model_path, num_labels=2)

# 3. Tokenization
print("🪙 Tokenizing text...")
def tokenize_data(texts, max_length=512):
    results = {'input_ids': [], 'attention_mask': []}
    batch_size = 1000
    for i in tqdm(range(0, len(texts), batch_size), desc="Tokenizing"):
        batch = texts[i:i+batch_size]
        encoded = tokenizer(
            batch,
            truncation=True,
            padding='max_length',
            max_length=max_length,
            return_tensors='pt',
            return_attention_mask=True,
            return_token_type_ids=False
        )
        results['input_ids'].append(encoded['input_ids'])
        results['attention_mask'].append(encoded['attention_mask'])
    return {
        'input_ids': torch.cat(results['input_ids']),
        'attention_mask': torch.cat(results['attention_mask'])
    }

test_encodings = tokenize_data(liar_test['processed_text'].tolist())

# 4. Dataset Class
class CustomDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = torch.tensor(labels.values, dtype=torch.long)

    def __getitem__(self, idx):
        return {
            'input_ids': self.encodings['input_ids'][idx],
            'attention_mask': self.encodings['attention_mask'][idx],
            'labels': self.labels[idx]
        }

    def __len__(self):
        return len(self.labels)

print("\n📝 Creating dataset...")
test_dataset = CustomDataset(test_encodings, liar_test['label'])

# 5. Prediction Function
def predict(model, dataset, batch_size=32):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    model.eval()
    preds = []
    true_labels = []
    for i in tqdm(range(0, len(dataset), batch_size), desc="Predicting"):
        # Get batch
        batch_indices = range(i, min(i+batch_size, len(dataset)))
        batch = [dataset[j] for j in batch_indices]
        # Prepare inputs
        inputs = {
            'input_ids': torch.stack([item['input_ids'] for item in batch]).to(device),
            'attention_mask': torch.stack([item['attention_mask'] for item in batch]).to(device)
        }
        # Predict
        with torch.no_grad():
            outputs = model(**inputs)
        # Store results
        preds.extend(torch.argmax(outputs.logits, dim=1).cpu().numpy())
        true_labels.extend([item['labels'] for item in batch])
    return np.array(preds), np.array(true_labels)

# 6. Run Evaluation
print("\n🧪 Evaluating on LIAR test set...")
y_pred, y_true = predict(model, test_dataset)

# 7. Performance Report
print("\n📊 DistilBERT Performance on LIAR Dataset:")
print(classification_report(y_true, y_pred, target_names=['Reliable', 'Fake']))
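
`confusion_matrix`, `ConfusionMatrixDisplay` and `matplotlib` are imported above but not used in the lines shown; a plausible use, sketched here as an assumption rather than the file's actual remaining lines:

cm = confusion_matrix(y_true, y_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=['Reliable', 'Fake'])
disp.plot(cmap='Blues')
plt.title("DistilBERT on LIAR test set")
plt.savefig("distilbert_liar_confusion_matrix.png")  # hypothetical output path
plt.show()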