From 3cf9c715bc185f68a89d46464caa6f757726a21a Mon Sep 17 00:00:00 2001 From: Andrew Trieu Date: Thu, 3 Apr 2025 13:41:14 +0300 Subject: [PATCH] Add preprocessing and evaluation for LIAR dataset using DistilBERT --- archives/fnc2.log | 29 ++++++++++++- src/fnc1a.py | 26 ++++++----- src/fnc1b.py | 53 +++++++++++------------ src/fnc2.py | 78 +++++++++++++++++++++++---------- src/fnc4a.py | 38 ++++++++++++++++ src/fnc4b.py | 107 ++++++++++++++++++++++++++++++++++++++++++++++ 6 files changed, 264 insertions(+), 67 deletions(-) create mode 100644 src/fnc4a.py create mode 100644 src/fnc4b.py diff --git a/archives/fnc2.log b/archives/fnc2.log index 756a36e..c6822ac 100644 --- a/archives/fnc2.log +++ b/archives/fnc2.log @@ -5,7 +5,7 @@ ๐Ÿช™ Preprocessing text... ๐Ÿ” Training models... -๐Ÿ“Š Logistic Regression Test Performance: +๐Ÿ“Š Logistic Regression FakeNewsCorpus Performance: precision recall f1-score support Reliable 0.84 0.90 0.87 54706 @@ -16,7 +16,7 @@ weighted avg 0.83 0.83 0.83 85290 -๐Ÿ“Š Naรฏve Bayes Test Performance: +๐Ÿ“Š Naรฏve Bayes FakeNewsCorpus Performance: precision recall f1-score support Reliable 0.79 0.92 0.85 54706 @@ -25,3 +25,28 @@ weighted avg 0.83 0.83 0.83 85290 accuracy 0.79 85290 macro avg 0.79 0.74 0.76 85290 weighted avg 0.79 0.79 0.78 85290 + +๐Ÿ“š Loading LIAR dataset... +๐Ÿงฎ Grouping into binary classes... +๐Ÿช™ Preprocessing text... + +๐Ÿ“Š Logistic Regression LIAR Performance: + precision recall f1-score support + + Reliable 0.75 0.79 0.77 926 + Fake 0.32 0.26 0.29 338 + + accuracy 0.65 1264 + macro avg 0.53 0.53 0.53 1264 +weighted avg 0.63 0.65 0.64 1264 + + +๐Ÿ“Š Naรฏve Bayes LIAR Performance: + precision recall f1-score support + + Reliable 0.74 0.98 0.84 926 + Fake 0.55 0.06 0.11 338 + + accuracy 0.74 1264 + macro avg 0.65 0.52 0.48 1264 +weighted avg 0.69 0.74 0.65 1264 \ No newline at end of file diff --git a/src/fnc1a.py b/src/fnc1a.py index 3527d4c..e4c5d55 100644 --- a/src/fnc1a.py +++ b/src/fnc1a.py @@ -32,31 +32,29 @@ except OSError: print_log("๐Ÿ“– spaCy model loaded.") # Paths -csv_path = "../data/news_cleaned_2018_02_13.csv" -parquet_path = "../data/news_cleaned_2018_02_13.parquet" -output_parquet = "../data/processed_fakenews.parquet" -output_csv = "../data/processed_fakenews.csv" +input_path = "../data/news_cleaned_2018_02_13" +output_path = "../data/processed_fakenews" # Convert CSV to Parquet if needed -if os.path.exists(parquet_path): - data_path = parquet_path -elif os.path.exists(csv_path): +if os.path.exists(input_path + ".parquet"): + data_path = input_path + ".parquet" +elif os.path.exists(input_path + ".csv"): print_log("๐Ÿ”„ Converting CSV to Parquet...") chunksize=1e5 pqwriter = None - for i, df in enumerate(pd.read_csv(csv_path, lineterminator="\n", on_bad_lines="skip", chunksize=chunksize, usecols=["id", "content", "type"])): + for i, df in enumerate(pd.read_csv(input_path + ".csv", lineterminator="\n", on_bad_lines="skip", chunksize=chunksize, usecols=["id", "content", "type"])): table = pa.Table.from_pandas(df) # If it's the first chunk, create a new parquet writer if i == 0: - pqwriter = pq.ParquetWriter(parquet_path, table.schema) + pqwriter = pq.ParquetWriter(input_path + ".parquet", table.schema) pqwriter.write_table(table) if pqwriter: pqwriter.close() print_log("โœ… Conversion complete.") - data_path = parquet_path + data_path = input_path + ".parquet" else: print_log("โŒ Error: No dataset found.") exit() @@ -120,12 +118,12 @@ for batch in parquet_file.iter_batches(batch_size): # Save processed data final_df = 
pd.concat(processed_chunks, ignore_index=True)
-final_df.to_parquet(output_parquet, index=False)
-final_df.to_csv(output_csv, index=False)
+final_df.to_parquet(output_path + ".parquet", index=False)
+final_df.to_csv(output_path + ".csv", index=False)

-print_log(f"💾 Processed data saved to '{output_parquet}' and '{output_csv}'")
+print_log(f"💾 Processed data saved to '{output_path}.parquet' and '{output_path}.csv'")

 # Print statistics
 total_vocab_before = len(vocab_before)
 total_vocab_after_stopwords = len(vocab_after_stopwords)
 total_vocab_after_stemming = len(vocab_after_stemming)
diff --git a/src/fnc1b.py b/src/fnc1b.py
index 4e8bc4f..6c57a79 100644
--- a/src/fnc1b.py
+++ b/src/fnc1b.py
@@ -1,13 +1,10 @@
-import random
 import pandas as pd
 import os
 import subprocess
-import pyarrow as pa
 import pyarrow.parquet as pq

-parquet_path = "../data/processed_fakenews.parquet"
-csv_path = "../data/processed_fakenews.csv"
-sample_path = "../data/sampled_fakenews"
+input_path = "../data/processed_fakenews"
+output_path = "../data/sampled_fakenews"

 SAMPLE_FRACTION = 0.1
 RANDOM_SEED = 42  # For reproducibility
@@ -22,57 +19,57 @@ def sample_dataframe(df, total_rows):
     return df.sample(n=sample_size, random_state=RANDOM_SEED)

 # Try to load from Parquet first, fall back to CSV if not available
-if os.path.exists(parquet_path):
-    print(f"🔍 Loading data from Parquet file at '{parquet_path}'")
+if os.path.exists(input_path + ".parquet"):
+    print(f"🔍 Loading data from Parquet file at '{input_path}.parquet'")
     try:
         # Read metadata to get row count without loading entire file
-        parquet_file = pq.ParquetFile(parquet_path)
+        parquet_file = pq.ParquetFile(input_path + ".parquet")
         total_rows = parquet_file.metadata.num_rows
         print(f"🔍 Dataset contains {total_rows:,} rows.")

         # Read and sample the data
-        df_sample = sample_dataframe(pd.read_parquet(parquet_path), total_rows)
+        df_sample = sample_dataframe(pd.read_parquet(input_path + ".parquet"), total_rows)
     except Exception as e:
         print(f"❌ Error reading Parquet file: {e}")
         print("🔄 Falling back to CSV...")

-        if not os.path.exists(csv_path):
-            print(f"❌ Error: Neither Parquet nor CSV file found at {parquet_path} or {csv_path}")
+        if not os.path.exists(input_path + ".csv"):
+            print(f"❌ Error: Neither Parquet nor CSV file found at {input_path}.parquet or {input_path}.csv")
             exit()

         # Get total rows from CSV (Unix-like systems only due to `wc`)
-        total_rows = int(subprocess.check_output(["wc", "-l", csv_path]).split()[0]) - 1
+        total_rows = int(subprocess.check_output(["wc", "-l", input_path + ".csv"]).split()[0]) - 1
         print(f"🔍 Dataset contains {total_rows:,} rows.")

         # Read and sample the data
         df_sample = sample_dataframe(
-            pd.read_csv(csv_path, lineterminator="\n", on_bad_lines="skip"),
+            pd.read_csv(input_path + ".csv", lineterminator="\n", on_bad_lines="skip"),
             total_rows
         )

-elif os.path.exists(csv_path):
-    print(f"🔍 Parquet file not found, loading from CSV at {csv_path}")
+elif os.path.exists(input_path + ".csv"):
+    print(f"🔍 Parquet file not found, loading from CSV at {input_path}.csv")

     # Get total rows from CSV (Unix-like systems only due to `wc`)
-    total_rows = int(subprocess.check_output(["wc", "-l", csv_path]).split()[0]) - 1
+    total_rows = int(subprocess.check_output(["wc", "-l", input_path + ".csv"]).split()[0]) - 1
     print(f"🔍 Dataset contains {total_rows:,} rows.")

     # Read and sample the data
     df_sample = sample_dataframe(
-        pd.read_csv(csv_path, lineterminator="\n", on_bad_lines="skip"),
+        pd.read_csv(input_path + ".csv", lineterminator="\n", on_bad_lines="skip"),
         total_rows
     )
 else:
-    print(f"❌ Error: Neither Parquet nor CSV file found at {parquet_path} or {csv_path}")
+    print(f"❌ Error: Neither Parquet nor CSV file found at {input_path}.parquet or {input_path}.csv")
     exit()

 # Verify the sample size
 print(f"✅ Sample contains {len(df_sample):,} rows (expected {get_sample_size(total_rows=total_rows):,} rows)")

 # Save the sample in both formats
-df_sample.to_csv(f"{sample_path}.csv", index=False)
-df_sample.to_parquet(f"{sample_path}.parquet", index=False)
+df_sample.to_csv(f"{output_path}.csv", index=False)
+df_sample.to_parquet(f"{output_path}.parquet", index=False)

-print(f"💾 Sample saved to '{sample_path}.csv' and '{sample_path}.parquet'.")
+print(f"💾 Sample saved to '{output_path}.csv' and '{output_path}.parquet'.")

 # Split to 80/10/10 and save as both CSV and Parquet
 train_size = int(len(df_sample) * 0.8)
@@ -83,13 +80,13 @@ df_train = df_sample.iloc[:train_size]
 df_valid = df_sample.iloc[train_size:train_size + valid_size]
 df_test = df_sample.iloc[train_size + valid_size:]

-df_train.to_csv(f"{sample_path}_train.csv", index=False)
-df_valid.to_csv(f"{sample_path}_valid.csv", index=False)
-df_test.to_csv(f"{sample_path}_test.csv", index=False)
+df_train.to_csv(f"{output_path}_train.csv", index=False)
+df_valid.to_csv(f"{output_path}_valid.csv", index=False)
+df_test.to_csv(f"{output_path}_test.csv", index=False)

-df_train.to_parquet(f"{sample_path}_train.parquet", index=False)
-df_valid.to_parquet(f"{sample_path}_valid.parquet", index=False)
-df_test.to_parquet(f"{sample_path}_test.parquet", index=False)
+df_train.to_parquet(f"{output_path}_train.parquet", index=False)
+df_valid.to_parquet(f"{output_path}_valid.parquet", index=False)
+df_test.to_parquet(f"{output_path}_test.parquet", index=False)

-print(f"💾 Train/Valid/Test splits saved to '{sample_path}_train.csv', '{sample_path}_valid.csv', '{sample_path}_test.csv'.")
+print(f"💾 Train/Valid/Test splits saved to '{output_path}_train.csv', '{output_path}_valid.csv', '{output_path}_test.csv'.")
diff --git a/src/fnc2.py b/src/fnc2.py
index b6bc04b..775dbdb 100644
--- a/src/fnc2.py
+++ b/src/fnc2.py
@@ -5,6 +5,30 @@ from sklearn.model_selection import GridSearchCV
 from sklearn.naive_bayes import MultinomialNB
 from sklearn.metrics import classification_report

+input_path = "../data/sampled_fakenews"
+
+# Function to perform hyperparameter tuning, not used in the final script
+def hyperparameter_tuning():
+    print("🔍 Hyperparameter tuning...")
+    param_grid_lr = {
+        'C': [0.1, 1, 10],
+        'max_iter': [100, 500, 1000],
+        'class_weight': ['balanced', None]
+    }
+    grid = GridSearchCV(LogisticRegression(), param_grid_lr, cv=3)
+    grid.fit(X_train, y_train)
+    print("✅ Best Logistic Regression Parameters:", grid.best_params_)
+
+    param_grid_nb = {
+        'alpha': [0.1, 0.5, 1.0, 2.0],
+        'fit_prior': [True, False]
+    }
+    grid_nb = GridSearchCV(MultinomialNB(), param_grid_nb, cv=3)
+    grid_nb.fit(X_val, y_val)
+    print("✅ Best Naïve Bayes Parameters:", grid_nb.best_params_)
+
+#---FAKENEWSCORPUS DATASET---
+
 # Load parquet first, fall back to CSV if not available
 def load_split(file_prefix, split_name):
     try:
@@ -17,9 +41,9 @@
         print(f"🔄 Parquet file not found, loading from CSV at '{file_prefix}_{split_name}.csv'")
         return pd.read_csv(f"{file_prefix}_{split_name}.csv")

-train = load_split("../data/sampled_fakenews", "train")
-val = load_split("../data/sampled_fakenews", "valid")
-test = load_split("../data/sampled_fakenews", "test")
+train = load_split(input_path, "train")
+val = load_split(input_path, "valid")
+test = load_split(input_path, "test")

 # "Political" and "bias" may not be inherently fake, and "unknown" is neutral
 print("🧮 Grouping into binary classes...")
@@ -46,29 +70,37 @@ nb = MultinomialNB(alpha=0.1, fit_prior=True)
 nb.fit(X_train, y_train)

 y_test_pred_lr = lr.predict(X_test)
-print("\n📊 Logistic Regression Test Performance:")
+print("\n📊 Logistic Regression FakeNewsCorpus Performance:")
 print(classification_report(y_test, y_test_pred_lr, target_names=['Reliable', 'Fake']))

 y_test_pred_nb = nb.predict(X_test)
-print("\n📊 Naïve Bayes Test Performance:")
+print("\n📊 Naïve Bayes FakeNewsCorpus Performance:")
 print(classification_report(y_test, y_test_pred_nb, target_names=['Reliable', 'Fake']))

-# Function to perform hyperparameter tuning, not used in the final script
-def hyperparameter_tuning():
-    print("🔍 Hyperparameter tuning...")
-    param_grid_lr = {
-        'C': [0.1, 1, 10],
-        'max_iter': [100, 500, 1000],
-        'class_weight': ['balanced', None]
-    }
-    grid = GridSearchCV(LogisticRegression(), param_grid_lr, cv=3)
-    grid.fit(X_train, y_train)
-    print("✅ Best Logistic Regression Parameters:", grid.best_params_)
+#---LIAR DATASET---
+# Load the preprocessed LIAR test set (written by fnc4a.py)
+print("📚 Loading LIAR dataset...")
+liar_test = pd.read_csv("../data/liar_processed_test.csv")
+
+# Collapse LIAR's six-way labels: only "false" and "pants-fire" count as fake
+print("🧮 Grouping into binary classes...")
+liar_fake_labels = {'false', 'pants-fire'}
+liar_test['label'] = liar_test.iloc[:, 1].apply(lambda x: 1 if x in liar_fake_labels else 0)
+
+# Check for NaN values in processed_text
+liar_test = liar_test.dropna(subset=['processed_text'])
+
+# Transform LIAR text using the same TF-IDF vectorizer
+print("🪙 Preprocessing text...")
+X_liar_test = tfidf.transform(liar_test['processed_text'])
+
+# Logistic Regression
+y_liar_pred_lr = lr.predict(X_liar_test)
+print("\n📊 Logistic Regression LIAR Performance:")
+print(classification_report(liar_test['label'], y_liar_pred_lr, target_names=['Reliable', 'Fake']))
+
+# Naïve Bayes
+y_liar_pred_nb = nb.predict(X_liar_test)
+print("\n📊 Naïve Bayes LIAR Performance:")
+print(classification_report(liar_test['label'], y_liar_pred_nb, target_names=['Reliable', 'Fake']))

-    param_grid_nb = {
-        'alpha': [0.1, 0.5, 1.0, 2.0],
-        'fit_prior': [True, False]
-    }
-    grid_nb = GridSearchCV(MultinomialNB(), param_grid_nb, cv=3)
-    grid_nb.fit(X_val, y_val)
-    print("✅ Best Naïve Bayes Parameters:", grid_nb.best_params_)
diff --git a/src/fnc4a.py b/src/fnc4a.py
new file mode 100644
index 0000000..f25a68f
--- /dev/null
+++ b/src/fnc4a.py
@@ -0,0 +1,38 @@
+import pandas as pd
+from nltk.corpus import stopwords
+from nltk.stem import PorterStemmer
+
+input_path = "../data/liar"
+output_path = "../data/liar_processed"
+
+# Initialize preprocessing tools
+stemmer = PorterStemmer()
+stop_words = set(stopwords.words('english'))
+
+# Apply the same process as FakeNewsCorpus
+def preprocess_text(text):
+    if not isinstance(text, str):
+        return ""
+
+    # Tokenization
+    tokens = [word.lower() for word in text.split() if word.isalpha()]
+
+    # Stopword removal
+    tokens = [word for word in tokens if word not in stop_words]
+
+    # Stemming
+    tokens = [stemmer.stem(word) for word in tokens]
+
+    return ' '.join(tokens)
+
+# Load LIAR dataset
+print("🔍 Loading LIAR dataset...")
+liar_test = pd.read_csv(input_path + "_test.tsv", sep='\t', header=None)
+
+# Apply preprocessing (column 2 contains the text statements)
+print("🪙 Preprocessing LIAR text...")
+liar_test['processed_text'] = liar_test[2].apply(preprocess_text)
+
+# Save preprocessed data (comma-separated, so use a .csv extension)
+liar_test.to_csv(output_path + "_test.csv", index=False)
+print(f"💾 Preprocessed LIAR data saved to '{output_path}_test.csv'")
\ No newline at end of file
diff --git a/src/fnc4b.py b/src/fnc4b.py
new file mode 100644
index 0000000..b4c1233
--- /dev/null
+++ b/src/fnc4b.py
@@ -0,0 +1,107 @@
+import torch
+from torch.utils.data import Dataset
+import pandas as pd
+import numpy as np
+from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
+from tqdm import tqdm
+from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification
+import matplotlib.pyplot as plt
+
+# 1. Load preprocessed LIAR dataset, dropping rows whose statements reduced to empty/NaN
+print("📚 Loading LIAR dataset...")
+liar_test = pd.read_csv("../data/liar_processed_test.csv").dropna(subset=['processed_text'])
+
+# Binary label mapping, same grouping as fnc2.py
+print("🧮 Grouping into binary classes...")
+liar_fake_labels = {'false', 'pants-fire'}  # Only these two of LIAR's six labels count as fake
+liar_test['label'] = liar_test.iloc[:, 1].apply(lambda x: 1 if x in liar_fake_labels else 0)
+
+# 2. Load model and tokenizer
+model_path = "./fake_news_bert"
+print(f"⬇️ Loading model from {model_path}...")
+tokenizer = DistilBertTokenizerFast.from_pretrained(model_path, do_lower_case=True)
+model = DistilBertForSequenceClassification.from_pretrained(model_path, num_labels=2)
+
+# 3. Tokenization
+print("🪙 Tokenizing text...")
+def tokenize_data(texts, max_length=512):
+    results = {'input_ids': [], 'attention_mask': []}
+    batch_size = 1000
+
+    for i in tqdm(range(0, len(texts), batch_size), desc="Tokenizing"):
+        batch = texts[i:i+batch_size]
+        encoded = tokenizer(
+            batch,
+            truncation=True,
+            padding='max_length',
+            max_length=max_length,
+            return_tensors='pt',
+            return_attention_mask=True,
+            return_token_type_ids=False
+        )
+        results['input_ids'].append(encoded['input_ids'])
+        results['attention_mask'].append(encoded['attention_mask'])

+    return {
+        'input_ids': torch.cat(results['input_ids']),
+        'attention_mask': torch.cat(results['attention_mask'])
+    }
+
+test_encodings = tokenize_data(liar_test['processed_text'].tolist())
+
+# 4. Dataset Class
+class CustomDataset(Dataset):
+    def __init__(self, encodings, labels):
+        self.encodings = encodings
+        self.labels = torch.tensor(labels.values, dtype=torch.long)
+
+    def __getitem__(self, idx):
+        return {
+            'input_ids': self.encodings['input_ids'][idx],
+            'attention_mask': self.encodings['attention_mask'][idx],
+            'labels': self.labels[idx]
+        }
+
+    def __len__(self):
+        return len(self.labels)
+
+print("\n📝 Creating dataset...")
+test_dataset = CustomDataset(test_encodings, liar_test['label'])
+
+# 5. Prediction Function
+def predict(model, dataset, batch_size=32):
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    model.to(device)
+    model.eval()
+
+    preds = []
+    true_labels = []
+
+    for i in tqdm(range(0, len(dataset), batch_size), desc="Predicting"):
+        # Get batch
+        batch_indices = range(i, min(i+batch_size, len(dataset)))
+        batch = [dataset[j] for j in batch_indices]
+
+        # Prepare inputs
+        inputs = {
+            'input_ids': torch.stack([item['input_ids'] for item in batch]).to(device),
+            'attention_mask': torch.stack([item['attention_mask'] for item in batch]).to(device)
+        }
+
+        # Predict
+        with torch.no_grad():
+            outputs = model(**inputs)
+
+        # Store results (convert 0-dim label tensors to Python ints)
+        preds.extend(torch.argmax(outputs.logits, dim=1).cpu().numpy())
+        true_labels.extend([item['labels'].item() for item in batch])
+
+    return np.array(preds), np.array(true_labels)
+
+# 6. Run Evaluation
+print("\n🧪 Evaluating on LIAR test set...")
+y_pred, y_true = predict(model, test_dataset)
+
+# 7. Performance Report
+print("\n📊 DistilBERT Performance on LIAR Dataset:")
+print(classification_report(y_true, y_pred, target_names=['Reliable', 'Fake']))
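
Note on fnc4b.py: confusion_matrix, ConfusionMatrixDisplay, and matplotlib.pyplot are imported but never used in the hunk above. A minimal sketch of the plotting step those imports suggest, to append after the classification report; the figure title and output path are assumptions, not part of this patch:

    # Hypothetical follow-up: plot and save a confusion matrix using the already-imported helpers
    cm = confusion_matrix(y_true, y_pred)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=['Reliable', 'Fake'])
    disp.plot(cmap='Blues')
    plt.title("DistilBERT on LIAR")
    plt.savefig("../archives/fnc4b_confusion.png", bbox_inches='tight')  # assumed output location
    print("💾 Confusion matrix saved to '../archives/fnc4b_confusion.png'")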