Add preprocessing and evaluation for LIAR dataset using DistilBERT

2025-04-03 13:41:14 +03:00
parent 1df0e66bc8
commit 3cf9c715bc
6 changed files with 264 additions and 67 deletions

View File

@@ -5,7 +5,7 @@
 🪙 Preprocessing text...
 🔍 Training models...
-📊 Logistic Regression Test Performance:
+📊 Logistic Regression FakeNewsCorpus Performance:
               precision    recall  f1-score   support
     Reliable       0.84      0.90      0.87     54706
@@ -16,7 +16,7 @@
 weighted avg       0.83      0.83      0.83     85290
-📊 Naïve Bayes Test Performance:
+📊 Naïve Bayes FakeNewsCorpus Performance:
               precision    recall  f1-score   support
     Reliable       0.79      0.92      0.85     54706
@@ -25,3 +25,28 @@ weighted avg       0.83      0.83      0.83     85290
     accuracy                           0.79     85290
    macro avg       0.79      0.74      0.76     85290
 weighted avg       0.79      0.79      0.78     85290
+📚 Loading LIAR dataset...
+🧮 Grouping into binary classes...
+🪙 Preprocessing text...
+📊 Logistic Regression LIAR Performance:
+              precision    recall  f1-score   support
+    Reliable       0.75      0.79      0.77       926
+        Fake       0.32      0.26      0.29       338
+    accuracy                           0.65      1264
+   macro avg       0.53      0.53      0.53      1264
+weighted avg       0.63      0.65      0.64      1264
+📊 Naïve Bayes LIAR Performance:
+              precision    recall  f1-score   support
+    Reliable       0.74      0.98      0.84       926
+        Fake       0.55      0.06      0.11       338
+    accuracy                           0.74      1264
+   macro avg       0.65      0.52      0.48      1264
+weighted avg       0.69      0.74      0.65      1264

View File

@@ -32,31 +32,29 @@ except OSError:
 print_log("📖 spaCy model loaded.")
 
 # Paths
-csv_path = "../data/news_cleaned_2018_02_13.csv"
-parquet_path = "../data/news_cleaned_2018_02_13.parquet"
-output_parquet = "../data/processed_fakenews.parquet"
-output_csv = "../data/processed_fakenews.csv"
+input_path = "../data/news_cleaned_2018_02_13"
+output_path = "../data/processed_fakenews"
 
 # Convert CSV to Parquet if needed
-if os.path.exists(parquet_path):
-    data_path = parquet_path
-elif os.path.exists(csv_path):
+if os.path.exists(input_path + ".parquet"):
+    data_path = input_path + ".parquet"
+elif os.path.exists(input_path + ".csv"):
     print_log("🔄 Converting CSV to Parquet...")
     chunksize=1e5
     pqwriter = None
-    for i, df in enumerate(pd.read_csv(csv_path, lineterminator="\n", on_bad_lines="skip", chunksize=chunksize, usecols=["id", "content", "type"])):
+    for i, df in enumerate(pd.read_csv(input_path + ".csv", lineterminator="\n", on_bad_lines="skip", chunksize=chunksize, usecols=["id", "content", "type"])):
         table = pa.Table.from_pandas(df)
         # If it's the first chunk, create a new parquet writer
         if i == 0:
-            pqwriter = pq.ParquetWriter(parquet_path, table.schema)
+            pqwriter = pq.ParquetWriter(input_path + ".parquet", table.schema)
         pqwriter.write_table(table)
     if pqwriter:
         pqwriter.close()
     print_log("✅ Conversion complete.")
-    data_path = parquet_path
+    data_path = input_path + ".parquet"
 else:
     print_log("❌ Error: No dataset found.")
     exit()
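
For context, the hunk below shows the converted Parquet file being consumed with `iter_batches`. A minimal sketch of that read-back pattern, where `preprocess_chunk` is a hypothetical stand-in for the real per-chunk processing that falls outside the captured hunks:

parquet_file = pq.ParquetFile(data_path)
batch_size = 100_000                      # illustrative value
processed_chunks = []
for batch in parquet_file.iter_batches(batch_size):
    chunk = batch.to_pandas()             # each Arrow record batch becomes a small DataFrame
    processed_chunks.append(preprocess_chunk(chunk))
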
@@ -120,12 +118,12 @@ for batch in parquet_file.iter_batches(batch_size):
 # Save processed data
 final_df = pd.concat(processed_chunks, ignore_index=True)
-final_df.to_parquet(output_parquet, index=False)
-final_df.to_csv(output_csv, index=False)
-print_log(f"💾 Processed data saved to '{output_parquet}' and '{output_csv}'")
+final_df.to_parquet(output_path + ".parquet", index=False)
+final_df.to_csv(output_path + ".csv", index=False)
+print_log(f"💾 Processed data saved to '{output_path}.parquet' and '{output_path}.csv'")
 
 # Print statistics
 total_vocab_before = len(vocab_before)
 total_vocab_after_stopwords = len(vocab_after_stopwords)
 total_vocab_after_stemming = len(vocab_after_stemming)
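
The trailing context lines above reference `vocab_before`, `vocab_after_stopwords` and `vocab_after_stemming`, whose construction falls outside the diff. A minimal sketch of how such counters are typically accumulated while each chunk is preprocessed, assuming stop-word and stemmer objects like those created earlier in the script (names here are illustrative):

vocab_before = set()
vocab_after_stopwords = set()
vocab_after_stemming = set()

def update_vocab_stats(tokens):
    # tokens: lowercased word list for one document, before any filtering
    vocab_before.update(tokens)
    kept = [t for t in tokens if t not in stop_words]
    vocab_after_stopwords.update(kept)
    vocab_after_stemming.update(stemmer.stem(t) for t in kept)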

View File

@@ -1,13 +1,10 @@
-import random
 import pandas as pd
 import os
 import subprocess
-import pyarrow as pa
 import pyarrow.parquet as pq
 
-parquet_path = "../data/processed_fakenews.parquet"
-csv_path = "../data/processed_fakenews.csv"
-sample_path = "../data/sampled_fakenews"
+input_path = "../data/processed_fakenews"
+output_path = "../data/sampled_fakenews"
 
 SAMPLE_FRACTION = 0.1
 RANDOM_SEED = 42  # For reproducibility
@@ -22,57 +19,57 @@ def sample_dataframe(df, total_rows):
     return df.sample(n=sample_size, random_state=RANDOM_SEED)
 
 # Try to load from Parquet first, fall back to CSV if not available
-if os.path.exists(parquet_path):
-    print(f"🔍 Loading data from Parquet file at '{parquet_path}'")
+if os.path.exists(input_path + ".parquet"):
+    print(f"🔍 Loading data from Parquet file at '{input_path}.parquet'")
     try:
         # Read metadata to get row count without loading entire file
-        parquet_file = pq.ParquetFile(parquet_path)
+        parquet_file = pq.ParquetFile(input_path + ".parquet")
         total_rows = parquet_file.metadata.num_rows
         print(f"🔍 Dataset contains {total_rows:,} rows.")
         # Read and sample the data
-        df_sample = sample_dataframe(pd.read_parquet(parquet_path), total_rows)
+        df_sample = sample_dataframe(pd.read_parquet(input_path + ".parquet"), total_rows)
     except Exception as e:
         print(f"❌ Error reading Parquet file: {e}")
         print("🔄 Falling back to CSV...")
-        if not os.path.exists(csv_path):
-            print(f"❌ Error: Neither Parquet nor CSV file found at {parquet_path} or {csv_path}")
+        if not os.path.exists(input_path + ".csv"):
+            print(f"❌ Error: Neither Parquet nor CSV file found at {input_path}.parquet or {input_path}.csv")
             exit()
         # Get total rows from CSV (Unix-like systems only due to `wc`)
-        total_rows = int(subprocess.check_output(["wc", "-l", csv_path]).split()[0]) - 1
+        total_rows = int(subprocess.check_output(["wc", "-l", input_path + ".csv"]).split()[0]) - 1
         print(f"🔍 Dataset contains {total_rows:,} rows.")
         # Read and sample the data
         df_sample = sample_dataframe(
-            pd.read_csv(csv_path, lineterminator="\n", on_bad_lines="skip"),
+            pd.read_csv(input_path + ".csv", lineterminator="\n", on_bad_lines="skip"),
             total_rows
         )
-elif os.path.exists(csv_path):
-    print(f"🔍 Parquet file not found, loading from CSV at {csv_path}")
+elif os.path.exists(input_path + ".csv"):
+    print(f"🔍 Parquet file not found, loading from CSV at {input_path}.csv")
     # Get total rows from CSV (Unix-like systems only due to `wc`)
-    total_rows = int(subprocess.check_output(["wc", "-l", csv_path]).split()[0]) - 1
+    total_rows = int(subprocess.check_output(["wc", "-l", input_path + ".csv"]).split()[0]) - 1
    print(f"🔍 Dataset contains {total_rows:,} rows.")
     # Read and sample the data
     df_sample = sample_dataframe(
-        pd.read_csv(csv_path, lineterminator="\n", on_bad_lines="skip"),
+        pd.read_csv(input_path + ".csv", lineterminator="\n", on_bad_lines="skip"),
         total_rows
     )
 else:
-    print(f"❌ Error: Neither Parquet nor CSV file found at {parquet_path} or {csv_path}")
+    print(f"❌ Error: Neither Parquet nor CSV file found at {input_path}.parquet or {input_path}.csv")
     exit()
 
 # Verify the sample size
 print(f"✅ Sample contains {len(df_sample):,} rows (expected {get_sample_size(total_rows=total_rows):,} rows)")
 
 # Save the sample in both formats
-df_sample.to_csv(f"{sample_path}.csv", index=False)
-df_sample.to_parquet(f"{sample_path}.parquet", index=False)
-print(f"💾 Sample saved to '{sample_path}.csv' and '{sample_path}.parquet'.")
+df_sample.to_csv(f"{output_path}.csv", index=False)
+df_sample.to_parquet(f"{output_path}.parquet", index=False)
+print(f"💾 Sample saved to '{output_path}.csv' and '{output_path}.parquet'.")
 
 # Split to 80/10/10 and save as both CSV and Parquet
 train_size = int(len(df_sample) * 0.8)
@@ -83,13 +80,13 @@ df_train = df_sample.iloc[:train_size]
 df_valid = df_sample.iloc[train_size:train_size + valid_size]
 df_test = df_sample.iloc[train_size + valid_size:]
 
-df_train.to_csv(f"{sample_path}_train.csv", index=False)
-df_valid.to_csv(f"{sample_path}_valid.csv", index=False)
-df_test.to_csv(f"{sample_path}_test.csv", index=False)
-df_train.to_parquet(f"{sample_path}_train.parquet", index=False)
-df_valid.to_parquet(f"{sample_path}_valid.parquet", index=False)
-df_test.to_parquet(f"{sample_path}_test.parquet", index=False)
-print(f"💾 Train/Valid/Test splits saved to '{sample_path}_train.csv', '{sample_path}_valid.csv', '{sample_path}_test.csv'.")
+df_train.to_csv(f"{output_path}_train.csv", index=False)
+df_valid.to_csv(f"{output_path}_valid.csv", index=False)
+df_test.to_csv(f"{output_path}_test.csv", index=False)
+df_train.to_parquet(f"{output_path}_train.parquet", index=False)
+df_valid.to_parquet(f"{output_path}_valid.parquet", index=False)
+df_test.to_parquet(f"{output_path}_test.parquet", index=False)
+print(f"💾 Train/Valid/Test splits saved to '{output_path}_train.csv', '{output_path}_valid.csv', '{output_path}_test.csv'.")
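
The CSV fallback above counts rows with `wc -l`, which the comments note is Unix-only. A hedged, cross-platform alternative (not part of this commit) that streams the file instead of shelling out:

def count_csv_rows(path):
    # Count newlines without loading the file into memory; subtract 1 for the header row
    with open(path, "rb") as f:
        return sum(1 for _ in f) - 1

# total_rows = count_csv_rows(input_path + ".csv")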

View File

@@ -5,6 +5,30 @@ from sklearn.model_selection import GridSearchCV
 from sklearn.naive_bayes import MultinomialNB
 from sklearn.metrics import classification_report
 
+input_path = "../data/sampled_fakenews"
+
+# Function to perform hyperparameter tuning, not used in the final script
+def hyperparameter_tuning():
+    print("🔍 Hyperparameter tuning...")
+    param_grid_lr = {
+        'C': [0.1, 1, 10],
+        'max_iter': [100, 500, 1000],
+        'class_weight': ['balanced', None]
+    }
+    grid = GridSearchCV(LogisticRegression(), param_grid_lr, cv=3)
+    grid.fit(X_train, y_train)
+    print("✅ Best Logistic Regression Parameters:", grid.best_params_)
+    param_grid_nb = {
+        'alpha': [0.1, 0.5, 1.0, 2.0],
+        'fit_prior': [True, False]
+    }
+    grid_nb = GridSearchCV(MultinomialNB(), param_grid_nb, cv=3)
+    grid_nb.fit(X_val, y_val)
+    print("✅ Best Naïve Bayes Parameters:", grid_nb.best_params_)
+
+#---FAKENEWSCORPUS DATASET---
 # Load parquet first, fall back to CSV if not available
 def load_split(file_prefix, split_name):
     try:
@@ -17,9 +41,9 @@ def load_split(file_prefix, split_name):
         print(f"🔄 Parquet file not found, loading from CSV at '{file_prefix}_{split_name}.csv'")
         return pd.read_csv(f"{file_prefix}_{split_name}.csv")
 
-train = load_split("../data/sampled_fakenews", "train")
-val = load_split("../data/sampled_fakenews", "valid")
-test = load_split("../data/sampled_fakenews", "test")
+train = load_split(input_path, "train")
+val = load_split(input_path, "valid")
+test = load_split(input_path, "test")
 
 # "Political" and "bias" may not be inherently fake, and "unknown" is neutral
 print("🧮 Grouping into binary classes...")
@@ -46,29 +70,37 @@ nb = MultinomialNB(alpha=0.1, fit_prior=True)
 nb.fit(X_train, y_train)
 
 y_test_pred_lr = lr.predict(X_test)
-print("\n📊 Logistic Regression Test Performance:")
+print("\n📊 Logistic Regression FakeNewsCorpus Performance:")
 print(classification_report(y_test, y_test_pred_lr, target_names=['Reliable', 'Fake']))
 
 y_test_pred_nb = nb.predict(X_test)
-print("\n📊 Naïve Bayes Test Performance:")
+print("\n📊 Naïve Bayes FakeNewsCorpus Performance:")
 print(classification_report(y_test, y_test_pred_nb, target_names=['Reliable', 'Fake']))
 
-# Function to perform hyperparameter tuning, not used in the final script
-def hyperparameter_tuning():
-    print("🔍 Hyperparameter tuning...")
-    param_grid_lr = {
-        'C': [0.1, 1, 10],
-        'max_iter': [100, 500, 1000],
-        'class_weight': ['balanced', None]
-    }
-    grid = GridSearchCV(LogisticRegression(), param_grid_lr, cv=3)
-    grid.fit(X_train, y_train)
-    print("✅ Best Logistic Regression Parameters:", grid.best_params_)
-    param_grid_nb = {
-        'alpha': [0.1, 0.5, 1.0, 2.0],
-        'fit_prior': [True, False]
-    }
-    grid_nb = GridSearchCV(MultinomialNB(), param_grid_nb, cv=3)
-    grid_nb.fit(X_val, y_val)
-    print("✅ Best Naïve Bayes Parameters:", grid_nb.best_params_)
+#---LIAR DATASET---
+# Load the preprocessed LIAR test split
+print("📚 Loading LIAR dataset...")
+liar_test = pd.read_csv("../data/liar_test_processed.csv")
+
+# Only 'false' and 'pants-fire' are treated as fake; the remaining ratings count as reliable
+print("🧮 Grouping into binary classes...")
+liar_fake_labels = {'false', 'pants-fire'}
+liar_test['label'] = liar_test.iloc[:, 1].apply(lambda x: 1 if x in liar_fake_labels else 0)
+
+# Drop rows with NaN values in processed_text
+liar_test = liar_test.dropna(subset=['processed_text'])
+
+# Transform LIAR text using the same TF-IDF vectorizer fitted on FakeNewsCorpus
+print("🪙 Preprocessing text...")
+X_liar_test = tfidf.transform(liar_test['processed_text'])
+
+# Logistic Regression
+y_liar_pred_lr = lr.predict(X_liar_test)
+print("\n📊 Logistic Regression LIAR Performance:")
+print(classification_report(liar_test['label'], y_liar_pred_lr, target_names=['Reliable', 'Fake']))
+
+# Naïve Bayes
+y_liar_pred_nb = nb.predict(X_liar_test)
+print("\n📊 Naïve Bayes LIAR Performance:")
+print(classification_report(liar_test['label'], y_liar_pred_nb, target_names=['Reliable', 'Fake']))
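
The important detail in the LIAR block above is that `tfidf.transform` is used rather than `fit_transform`, so LIAR statements are projected into the exact feature space (vocabulary and IDF weights) learned from FakeNewsCorpus. A condensed sketch of the pattern, with illustrative column names and vectorizer settings (the real ones are defined earlier in this file, outside the captured hunks):

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

tfidf = TfidfVectorizer(max_features=50000)                  # illustrative settings
X_train = tfidf.fit_transform(train['content'])              # fit on FakeNewsCorpus only
lr = LogisticRegression(max_iter=1000).fit(X_train, y_train)
X_liar_test = tfidf.transform(liar_test['processed_text'])   # reuse the fitted vocabulary/IDF weights
y_liar_pred_lr = lr.predict(X_liar_test)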

src/fnc4a.py (new file, 38 lines)
View File

@@ -0,0 +1,38 @@
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

input_path = "../data/liar"
output_path = "../data/liar_processed"

# Initialize preprocessing tools
stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))

# Apply the same process as FakeNewsCorpus
def preprocess_text(text):
    if not isinstance(text, str):
        return ""
    # Tokenization
    tokens = [word.lower() for word in text.split() if word.isalpha()]
    # Stopword removal
    tokens = [word for word in tokens if word not in stop_words]
    # Stemming
    tokens = [stemmer.stem(word) for word in tokens]
    return ' '.join(tokens)

# Load LIAR dataset
print("🔍 Loading LIAR dataset...")
liar_test = pd.read_csv(input_path + "_test.tsv", sep='\t', header=None)

# Apply preprocessing (column 2 contains the text statements)
print("🪙 Preprocessing LIAR text...")
liar_test['processed_text'] = liar_test[2].apply(preprocess_text)

# Save preprocessed data
liar_test.to_csv(output_path + "_test.tsv", index=False)
print(f"💾 Preprocessed LIAR data saved to '{output_path}_test.tsv'")
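
The raw LIAR TSV has no header row; column 1 holds the truthfulness label and column 2 the statement text, which is why those positional indices are used here and in the evaluation scripts. For readability the columns could be named on load; a small sketch assuming the standard LIAR column order:

liar_test = pd.read_csv(input_path + "_test.tsv", sep='\t', header=None)
liar_test = liar_test.rename(columns={0: 'id', 1: 'label', 2: 'statement'})
liar_test['processed_text'] = liar_test['statement'].apply(preprocess_text)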

src/fnc4b.py (new file, 107 lines)
View File

@@ -0,0 +1,107 @@
import torch
from torch.utils.data import Dataset
import pandas as pd
import numpy as np
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
from tqdm import tqdm
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification
import matplotlib.pyplot as plt

# 1. Load and preprocess LIAR dataset
print("📚 Loading LIAR dataset...")
liar_test = pd.read_csv("../data/liar_test_processed.csv")

# Binary label mapping (adjust based on your LIAR preprocessing)
print("🧮 Grouping into binary classes...")
liar_fake_labels = {'false', 'pants-fire'}  # Update with your actual LIAR labels
liar_test['label'] = liar_test.iloc[:, 1].apply(lambda x: 1 if x in liar_fake_labels else 0)
# Drop rows with missing processed text so the tokenizer only receives strings
liar_test = liar_test.dropna(subset=['processed_text'])

# 2. Load model and tokenizer
model_path = "./fake_news_bert"
print(f"⬇️ Loading model from {model_path}...")
tokenizer = DistilBertTokenizerFast.from_pretrained(model_path, do_lower_case=True)
model = DistilBertForSequenceClassification.from_pretrained(model_path, num_labels=2)

# 3. Tokenization
print("🪙 Tokenizing text...")
def tokenize_data(texts, max_length=512):
    results = {'input_ids': [], 'attention_mask': []}
    batch_size = 1000
    for i in tqdm(range(0, len(texts), batch_size), desc="Tokenizing"):
        batch = texts[i:i+batch_size]
        encoded = tokenizer(
            batch,
            truncation=True,
            padding='max_length',
            max_length=max_length,
            return_tensors='pt',
            return_attention_mask=True,
            return_token_type_ids=False
        )
        results['input_ids'].append(encoded['input_ids'])
        results['attention_mask'].append(encoded['attention_mask'])
    return {
        'input_ids': torch.cat(results['input_ids']),
        'attention_mask': torch.cat(results['attention_mask'])
    }

test_encodings = tokenize_data(liar_test['processed_text'].tolist())

# 4. Dataset Class
class CustomDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = torch.tensor(labels.values, dtype=torch.long)

    def __getitem__(self, idx):
        return {
            'input_ids': self.encodings['input_ids'][idx],
            'attention_mask': self.encodings['attention_mask'][idx],
            'labels': self.labels[idx]
        }

    def __len__(self):
        return len(self.labels)

print("\n📝 Creating dataset...")
test_dataset = CustomDataset(test_encodings, liar_test['label'])

# 5. Prediction Function
def predict(model, dataset, batch_size=32):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    model.eval()
    preds = []
    true_labels = []
    for i in tqdm(range(0, len(dataset), batch_size), desc="Predicting"):
        # Get batch
        batch_indices = range(i, min(i+batch_size, len(dataset)))
        batch = [dataset[j] for j in batch_indices]
        # Prepare inputs
        inputs = {
            'input_ids': torch.stack([item['input_ids'] for item in batch]).to(device),
            'attention_mask': torch.stack([item['attention_mask'] for item in batch]).to(device)
        }
        # Predict
        with torch.no_grad():
            outputs = model(**inputs)
        # Store results
        preds.extend(torch.argmax(outputs.logits, dim=1).cpu().numpy())
        true_labels.extend([item['labels'] for item in batch])
    return np.array(preds), np.array(true_labels)

# 6. Run Evaluation
print("\n🧪 Evaluating on LIAR test set...")
y_pred, y_true = predict(model, test_dataset)

# 7. Performance Report
print("\n📊 DistilBERT Performance on LIAR Dataset:")
print(classification_report(y_true, y_pred, target_names=['Reliable', 'Fake']))
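
`confusion_matrix`, `ConfusionMatrixDisplay` and `matplotlib` are imported above but not used in the lines shown; a plausible use, sketched here as an assumption rather than the file's actual remaining lines:

cm = confusion_matrix(y_true, y_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=['Reliable', 'Fake'])
disp.plot(cmap='Blues')
plt.title("DistilBERT on LIAR test set")
plt.savefig("distilbert_liar_confusion_matrix.png")  # hypothetical output path
plt.show()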