Add preprocessing and evaluation for LIAR dataset using DistilBERT

This commit is contained in:
2025-04-03 13:41:14 +03:00
parent 1df0e66bc8
commit 3cf9c715bc
6 changed files with 264 additions and 67 deletions

View File

@@ -5,7 +5,7 @@
🪙 Preprocessing text...
🔍 Training models...
📊 Logistic Regression Test Performance:
📊 Logistic Regression FakeNewsCorpus Performance:
precision recall f1-score support
Reliable 0.84 0.90 0.87 54706
@@ -16,7 +16,7 @@
weighted avg 0.83 0.83 0.83 85290
📊 Naïve Bayes Test Performance:
📊 Naïve Bayes FakeNewsCorpus Performance:
precision recall f1-score support
Reliable 0.79 0.92 0.85 54706
@@ -25,3 +25,28 @@ weighted avg 0.83 0.83 0.83 85290
accuracy 0.79 85290
macro avg 0.79 0.74 0.76 85290
weighted avg 0.79 0.79 0.78 85290
📚 Loading LIAR dataset...
🧮 Grouping into binary classes...
🪙 Preprocessing text...
📊 Logistic Regression LIAR Performance:
precision recall f1-score support
Reliable 0.75 0.79 0.77 926
Fake 0.32 0.26 0.29 338
accuracy 0.65 1264
macro avg 0.53 0.53 0.53 1264
weighted avg 0.63 0.65 0.64 1264
📊 Naïve Bayes LIAR Performance:
precision recall f1-score support
Reliable 0.74 0.98 0.84 926
Fake 0.55 0.06 0.11 338
accuracy 0.74 1264
macro avg 0.65 0.52 0.48 1264
weighted avg 0.69 0.74 0.65 1264

View File

@@ -32,31 +32,29 @@ except OSError:
print_log("📖 spaCy model loaded.")
# Paths
csv_path = "../data/news_cleaned_2018_02_13.csv"
parquet_path = "../data/news_cleaned_2018_02_13.parquet"
output_parquet = "../data/processed_fakenews.parquet"
output_csv = "../data/processed_fakenews.csv"
input_path = "../data/news_cleaned_2018_02_13"
output_path = "../data/processed_fakenews"
# Convert CSV to Parquet if needed
if os.path.exists(parquet_path):
data_path = parquet_path
elif os.path.exists(csv_path):
if os.path.exists(input_path + ".parquet"):
data_path = input_path + ".parquet"
elif os.path.exists(input_path + ".csv"):
print_log("🔄 Converting CSV to Parquet...")
chunksize = 100_000
pqwriter = None
for i, df in enumerate(pd.read_csv(csv_path, lineterminator="\n", on_bad_lines="skip", chunksize=chunksize, usecols=["id", "content", "type"])):
for i, df in enumerate(pd.read_csv(input_path + ".csv", lineterminator="\n", on_bad_lines="skip", chunksize=chunksize, usecols=["id", "content", "type"])):
table = pa.Table.from_pandas(df)
# If it's the first chunk, create a new parquet writer
if i == 0:
pqwriter = pq.ParquetWriter(parquet_path, table.schema)
pqwriter = pq.ParquetWriter(input_path + ".parquet", table.schema)
pqwriter.write_table(table)
if pqwriter:
pqwriter.close()
print_log("✅ Conversion complete.")
data_path = parquet_path
data_path = input_path + ".parquet"
else:
print_log("❌ Error: No dataset found.")
exit()
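The hunk below iterates the Parquet file in batches; the reader setup sits in unchanged context outside this diff. A minimal sketch of the assumed pattern (batch_size is an illustrative value, not necessarily the commit's):
import pyarrow.parquet as pq
# Assumed setup (not part of this diff): stream the file batch by batch
# so the full corpus never has to fit in memory at once.
parquet_file = pq.ParquetFile(data_path)   # data_path resolved above
batch_size = 10_000                        # illustrative
processed_chunks = []
for batch in parquet_file.iter_batches(batch_size):
    chunk = batch.to_pandas()              # pyarrow.RecordBatch -> DataFrame
    processed_chunks.append(chunk)         # the real script preprocesses each chunk first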
@@ -120,12 +118,12 @@ for batch in parquet_file.iter_batches(batch_size):
# Save processed data
final_df = pd.concat(processed_chunks, ignore_index=True)
final_df.to_parquet(output_parquet, index=False)
final_df.to_csv(output_csv, index=False)
final_df.to_parquet(output_path + ".parquet", index=False)
final_df.to_csv(output_path + ".csv", index=False)
print_log(f"💾 Processed data saved to '{output_parquet}' and '{output_csv}'")
print_log(f"💾 Processed data saved to '{output_path + ".parquet"}' and '{output_path + ".csv"}'")
# Print statistics
total_vocab_before = len(vocab_before)
total_vocab_after_stopwords = len(vocab_after_stopwords)
total_vocab_after_stemming = len(vocab_after_stemming)

View File

@@ -1,13 +1,10 @@
import random
import pandas as pd
import os
import subprocess
import pyarrow as pa
import pyarrow.parquet as pq
parquet_path = "../data/processed_fakenews.parquet"
csv_path = "../data/processed_fakenews.csv"
sample_path = "../data/sampled_fakenews"
input_path = "../data/processed_fakenews"
output_path = "../data/sampled_fakenews"
SAMPLE_FRACTION = 0.1
RANDOM_SEED = 42 # For reproducibility
@@ -22,57 +19,57 @@ def sample_dataframe(df, total_rows):
return df.sample(n=sample_size, random_state=RANDOM_SEED)
# Try to load from Parquet first, fall back to CSV if not available
if os.path.exists(parquet_path):
print(f"🔍 Loading data from Parquet file at '{parquet_path}'")
if os.path.exists(input_path + ".parquet"):
print(f"🔍 Loading data from Parquet file at '{input_path + ".parquet"}'")
try:
# Read metadata to get row count without loading entire file
parquet_file = pq.ParquetFile(parquet_path)
parquet_file = pq.ParquetFile(input_path + ".parquet")
total_rows = parquet_file.metadata.num_rows
print(f"🔍 Dataset contains {total_rows:,} rows.")
# Read and sample the data
df_sample = sample_dataframe(pd.read_parquet(parquet_path), total_rows)
df_sample = sample_dataframe(pd.read_parquet(input_path + ".parquet"), total_rows)
except Exception as e:
print(f"❌ Error reading Parquet file: {e}")
print("🔄 Falling back to CSV...")
if not os.path.exists(csv_path):
print(f"❌ Error: Neither Parquet nor CSV file found at {parquet_path} or {csv_path}")
if not os.path.exists(input_path + ".csv"):
print(f"❌ Error: Neither Parquet nor CSV file found at {input_path + ".parquet"} or {input_path + ".csv"}")
exit()
# Get total rows from CSV (Unix-like systems only due to `wc`)
total_rows = int(subprocess.check_output(["wc", "-l", csv_path]).split()[0]) - 1
total_rows = int(subprocess.check_output(["wc", "-l", input_path + ".csv"]).split()[0]) - 1
print(f"🔍 Dataset contains {total_rows:,} rows.")
# Read and sample the data
df_sample = sample_dataframe(
pd.read_csv(csv_path, lineterminator="\n", on_bad_lines="skip"),
pd.read_csv(input_path + ".csv", lineterminator="\n", on_bad_lines="skip"),
total_rows
)
elif os.path.exists(csv_path):
print(f"🔍 Parquet file not found, loading from CSV at {csv_path}")
elif os.path.exists(input_path + ".csv"):
print(f"🔍 Parquet file not found, loading from CSV at {input_path + ".csv"}")
# Get total rows from CSV (Unix-like systems only due to `wc`)
total_rows = int(subprocess.check_output(["wc", "-l", csv_path]).split()[0]) - 1
total_rows = int(subprocess.check_output(["wc", "-l", input_path + ".csv"]).split()[0]) - 1
print(f"🔍 Dataset contains {total_rows:,} rows.")
# Read and sample the data
df_sample = sample_dataframe(
pd.read_csv(csv_path, lineterminator="\n", on_bad_lines="skip"),
pd.read_csv(input_path + ".csv", lineterminator="\n", on_bad_lines="skip"),
total_rows
)
else:
print(f"❌ Error: Neither Parquet nor CSV file found at {parquet_path} or {csv_path}")
print(f"❌ Error: Neither Parquet nor CSV file found at {input_path + ".parquet"} or {input_path + ".csv"}")
exit()
# Verify the sample size
print(f"✅ Sample contains {len(df_sample):,} rows (expected {get_sample_size(total_rows=total_rows):,} rows)")
# Save the sample in both formats
df_sample.to_csv(f"{sample_path}.csv", index=False)
df_sample.to_parquet(f"{sample_path}.parquet", index=False)
df_sample.to_csv(f"{output_path}.csv", index=False)
df_sample.to_parquet(f"{output_path}.parquet", index=False)
print(f"💾 Sample saved to '{sample_path}.csv' and '{sample_path}.parquet'.")
print(f"💾 Sample saved to '{output_path}.csv' and '{output_path}.parquet'.")
# Split to 80/10/10 and save as both CSV and Parquet
train_size = int(len(df_sample) * 0.8)
@@ -83,13 +80,13 @@ df_train = df_sample.iloc[:train_size]
df_valid = df_sample.iloc[train_size:train_size + valid_size]
df_test = df_sample.iloc[train_size + valid_size:]
df_train.to_csv(f"{sample_path}_train.csv", index=False)
df_valid.to_csv(f"{sample_path}_valid.csv", index=False)
df_test.to_csv(f"{sample_path}_test.csv", index=False)
df_train.to_csv(f"{output_path}_train.csv", index=False)
df_valid.to_csv(f"{output_path}_valid.csv", index=False)
df_test.to_csv(f"{output_path}_test.csv", index=False)
df_train.to_parquet(f"{sample_path}_train.parquet", index=False)
df_valid.to_parquet(f"{sample_path}_valid.parquet", index=False)
df_test.to_parquet(f"{sample_path}_test.parquet", index=False)
df_train.to_parquet(f"{output_path}_train.parquet", index=False)
df_valid.to_parquet(f"{output_path}_valid.parquet", index=False)
df_test.to_parquet(f"{output_path}_test.parquet", index=False)
print(f"💾 Train/Valid/Test splits saved to '{sample_path}_train.csv', '{sample_path}_valid.csv', '{sample_path}_test.csv'.")
print(f"💾 Train/Valid/Test splits saved to '{output_path}_train.csv', '{output_path}_valid.csv', '{output_path}_test.csv'.")

View File

@@ -5,6 +5,30 @@ from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
input_path = "../data/sampled_fakenews"
# Function to perform hyperparameter tuning, not used in the final script
def hyperparameter_tuning():
print("🔍 Hyperparameter tuning...")
param_grid_lr = {
'C': [0.1, 1, 10],
'max_iter': [100, 500, 1000],
'class_weight': ['balanced', None]
}
grid = GridSearchCV(LogisticRegression(), param_grid_lr, cv=3)
grid.fit(X_train, y_train)
print("✅ Best Logistic Regression Parameters:", grid.best_params_)
param_grid_nb = {
'alpha': [0.1, 0.5, 1.0, 2.0],
'fit_prior': [True, False]
}
grid_nb = GridSearchCV(MultinomialNB(), param_grid_nb, cv=3)
grid_nb.fit(X_val, y_val)
print("✅ Best Naïve Bayes Parameters:", grid_nb.best_params_)
#---FAKENEWSCORPUS DATASET---
# Load parquet first, fall back to CSV if not available
def load_split(file_prefix, split_name):
try:
@@ -17,9 +41,9 @@ def load_split(file_prefix, split_name):
print(f"🔄 Parquet file not found, loading from CSV at '{file_prefix}_{split_name}.csv'")
return pd.read_csv(f"{file_prefix}_{split_name}.csv")
train = load_split("../data/sampled_fakenews", "train")
val = load_split("../data/sampled_fakenews", "valid")
test = load_split("../data/sampled_fakenews", "test")
train = load_split(input_path, "train")
val = load_split(input_path, "valid")
test = load_split(input_path, "test")
# "Political" and "bias" may not be inherently fake, and "unknown" is neutral
print("🧮 Grouping into binary classes...")
@@ -46,29 +70,37 @@ nb = MultinomialNB(alpha=0.1, fit_prior=True)
nb.fit(X_train, y_train)
y_test_pred_lr = lr.predict(X_test)
print("\n📊 Logistic Regression Test Performance:")
print("\n📊 Logistic Regression FakeNewsCorpus Performance:")
print(classification_report(y_test, y_test_pred_lr, target_names=['Reliable', 'Fake']))
y_test_pred_nb = nb.predict(X_test)
print("\n📊 Naïve Bayes Test Performance:")
print("\n📊 Naïve Bayes FakeNewsCorpus Performance:")
print(classification_report(y_test, y_test_pred_nb, target_names=['Reliable', 'Fake']))
# Function to perform hyperparameter tuning, not used in the final script
def hyperparameter_tuning():
print("🔍 Hyperparameter tuning...")
param_grid_lr = {
'C': [0.1, 1, 10],
'max_iter': [100, 500, 1000],
'class_weight': ['balanced', None]
}
grid = GridSearchCV(LogisticRegression(), param_grid_lr, cv=3)
grid.fit(X_train, y_train)
print("✅ Best Logistic Regression Parameters:", grid.best_params_)
#---LIAR DATASET---
# Load the preprocessed LIAR test split (CSV)
print("📚 Loading LIAR dataset...")
liar_test = pd.read_csv("../data/liar_test_processed.csv")
# "Political" and "bias" may not be inherently fake, and "unknown" is neutral
print("🧮 Grouping into binary classes...")
liar_fake_labels = {'false', 'pants-fire'}
liar_test['label'] = liar_test.iloc[:, 1].apply(lambda x: 1 if x in liar_fake_labels else 0)
# Check for NaN values in processed_text
liar_test = liar_test.dropna(subset=['processed_text'])
# Transform LIAR text using the same TF-IDF vectorizer
print("🪙 Preprocessing text...")
X_liar_test = tfidf.transform(liar_test['processed_text'])
# Logistic Regression
y_liar_pred_lr = lr.predict(X_liar_test)
print("\n📊 Logistic Regression LIAR Performance:")
print(classification_report(liar_test['label'], y_liar_pred_lr, target_names=['Reliable', 'Fake']))
# Naïve Bayes
y_liar_pred_nb = nb.predict(X_liar_test)
print("\n📊 Naïve Bayes LIAR Performance:")
print(classification_report(liar_test['label'], y_liar_pred_nb, target_names=['Reliable', 'Fake']))
param_grid_nb = {
'alpha': [0.1, 0.5, 1.0, 2.0],
'fit_prior': [True, False]
}
grid_nb = GridSearchCV(MultinomialNB(), param_grid_nb, cv=3)
grid_nb.fit(X_val, y_val)
print("✅ Best Naïve Bayes Parameters:", grid_nb.best_params_)

src/fnc4a.py (new file, 38 lines)
View File

@@ -0,0 +1,38 @@
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
input_path = "../data/liar"
output_path = "../data/liar_processed"
# Initialize preprocessing tools
stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))
# Apply the same process as FakeNewsCorpus
def preprocess_text(text):
if not isinstance(text, str):
return ""
# Tokenization
tokens = [word.lower() for word in text.split() if word.isalpha()]
# Stopword removal
tokens = [word for word in tokens if word not in stop_words]
# Stemming
tokens = [stemmer.stem(word) for word in tokens]
return ' '.join(tokens)
# Load LIAR dataset
print("🔍 Loading LIAR dataset...")
liar_test = pd.read_csv(input_path + "_test.tsv", sep='\t', header=None)
# Apply preprocessing (column 2 contains the text statements)
print("🪙 Preprocessing LIAR text...")
liar_test['processed_text'] = liar_test[2].apply(preprocess_text)
# Save preprocessed data
liar_test.to_csv(output_path + "_test.tsv", index=False)
print(f"💾 Preprocessed LIAR data saved to '{output_path + "_test.tsv"}'")

src/fnc4b.py (new file, 107 lines)
View File

@@ -0,0 +1,107 @@
import torch
from torch.utils.data import Dataset
import pandas as pd
import numpy as np
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
from tqdm import tqdm
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification
import matplotlib.pyplot as plt
# 1. Load and preprocess LIAR dataset
print("📚 Loading LIAR dataset...")
liar_test = pd.read_csv("../data/liar_test_processed.csv")
# Binary label mapping, mirroring the grouping used for the classical baselines
print("🧮 Grouping into binary classes...")
liar_fake_labels = {'false', 'pants-fire'}  # the harshest two of LIAR's six labels
liar_test['label'] = liar_test.iloc[:, 1].apply(lambda x: 1 if x in liar_fake_labels else 0)
# Drop rows whose processed text is missing, mirroring the classical pipeline
liar_test = liar_test.dropna(subset=['processed_text'])
# 2. Load model and tokenizer
model_path = "./fake_news_bert"
print(f"⬇️ Loading model from {model_path}...")
tokenizer = DistilBertTokenizerFast.from_pretrained(model_path, do_lower_case=True)
model = DistilBertForSequenceClassification.from_pretrained(model_path, num_labels=2)
# 3. Tokenization
print("🪙 Tokenizing text...")
def tokenize_data(texts, max_length=512):
results = {'input_ids': [], 'attention_mask': []}
batch_size = 1000
for i in tqdm(range(0, len(texts), batch_size), desc="Tokenizing"):
batch = texts[i:i+batch_size]
encoded = tokenizer(
batch,
truncation=True,
padding='max_length',
max_length=max_length,
return_tensors='pt',
return_attention_mask=True,
return_token_type_ids=False
)
results['input_ids'].append(encoded['input_ids'])
results['attention_mask'].append(encoded['attention_mask'])
return {
'input_ids': torch.cat(results['input_ids']),
'attention_mask': torch.cat(results['attention_mask'])
}
test_encodings = tokenize_data(liar_test['processed_text'].tolist())
# 4. Dataset Class
class CustomDataset(Dataset):
def __init__(self, encodings, labels):
self.encodings = encodings
self.labels = torch.tensor(labels.values, dtype=torch.long)
def __getitem__(self, idx):
return {
'input_ids': self.encodings['input_ids'][idx],
'attention_mask': self.encodings['attention_mask'][idx],
'labels': self.labels[idx]
}
def __len__(self):
return len(self.labels)
print("\n📝 Creating dataset...")
test_dataset = CustomDataset(test_encodings, liar_test['label'])
# 5. Prediction Function
def predict(model, dataset, batch_size=32):
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()
preds = []
true_labels = []
for i in tqdm(range(0, len(dataset), batch_size), desc="Predicting"):
# Get batch
batch_indices = range(i, min(i+batch_size, len(dataset)))
batch = [dataset[j] for j in batch_indices]
# Prepare inputs
inputs = {
'input_ids': torch.stack([item['input_ids'] for item in batch]).to(device),
'attention_mask': torch.stack([item['attention_mask'] for item in batch]).to(device)
}
# Predict
with torch.no_grad():
outputs = model(**inputs)
# Store results
preds.extend(torch.argmax(outputs.logits, dim=1).cpu().numpy())
true_labels.extend([item['labels'].item() for item in batch])
return np.array(preds), np.array(true_labels)
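Since CustomDataset already implements the Dataset protocol, the manual batching above could equally lean on torch's DataLoader; an alternative sketch (an assumption, not what the commit ships):
from torch.utils.data import DataLoader
def predict_with_loader(model, dataset, batch_size=32):
    # Same logic as predict(), but DataLoader handles batching and collation.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    model.eval()
    loader = DataLoader(dataset, batch_size=batch_size)
    preds, true_labels = [], []
    with torch.no_grad():
        for batch in loader:
            outputs = model(
                input_ids=batch['input_ids'].to(device),
                attention_mask=batch['attention_mask'].to(device),
            )
            preds.extend(torch.argmax(outputs.logits, dim=1).cpu().numpy())
            true_labels.extend(batch['labels'].numpy())
    return np.array(preds), np.array(true_labels)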
# 6. Run Evaluation
print("\n🧪 Evaluating on LIAR test set...")
y_pred, y_true = predict(model, test_dataset)
# 7. Performance Report
print("\n📊 DistilBERT Performance on LIAR Dataset:")
print(classification_report(y_true, y_pred, target_names=['Reliable', 'Fake']))
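confusion_matrix, ConfusionMatrixDisplay, and matplotlib are imported above but unused in this hunk, which suggests a plot was intended; a minimal sketch using those imports (an assumption):
# Sketch (assumption): visualize the report above as a confusion matrix.
cm = confusion_matrix(y_true, y_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=['Reliable', 'Fake'])
disp.plot(cmap='Blues')
plt.title('DistilBERT on LIAR')
plt.show()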