Add logging and evaluation for LIAR dataset using DistilBERT

2025-04-03 15:51:12 +03:00
parent 3cf9c715bc
commit 285fb51f96
2 changed files with 29 additions and 5 deletions
--- a/archives/fnc4b.log
+++ b/archives/fnc4b.log
@@ -0,0 +1,23 @@
+📚 Loading LIAR dataset...
+🧮 Grouping into binary classes...
+⬇️ Loading model from C:/Users/andre/OneDrive/Documents/code/fake_news_bert...
+Some weights of the model checkpoint at C:/Users/andre/OneDrive/Documents/code/fake_news_bert were not used when initializing DistilBertForSequenceClassification: ['loss_fct.weight']
+- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
+- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
+🪙 Tokenizing text...
+Tokenizing: 100%|████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 16.52batch/s] 
+
+📝 Creating dataset...
+
+🧪 Evaluating on LIAR test set...
+Predicting: 100%|█████████████████████████████████████████████████████████████████████████████████████████████| 40/40 [00:09<00:00,  4.01it/s]
+
+📊 DistilBERT Performance on LIAR Dataset:
+              precision    recall  f1-score   support
+
+    Reliable       0.74      0.67      0.70       926
+        Fake       0.28      0.34      0.31       338
+
+    accuracy                           0.59      1264
+   macro avg       0.51      0.51      0.51      1264
+weighted avg       0.61      0.59      0.60      1264
--- a/src/fnc4b.py
+++ b/src/fnc4b.py
@@ -2,10 +2,9 @@ import torch
 from torch.utils.data import Dataset
 import pandas as pd
 import numpy as np
-from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
+from sklearn.metrics import classification_report
 from tqdm import tqdm
 from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification
-import matplotlib.pyplot as plt

 # 1. Load and preprocess LIAR dataset
 print("📚 Loading LIAR dataset...")
@@ -16,8 +15,10 @@ print("🧮 Grouping into binary classes...")
 liar_fake_labels = {'false', 'pants-fire'}  # Update with your actual LIAR labels
 liar_test['label'] = liar_test.iloc[:, 1].apply(lambda x: 1 if x in liar_fake_labels else 0)

+liar_test = liar_test.dropna(subset=["processed_text"])
+
 # 2. Load model and tokenizer
-model_path = "./fake_news_bert"
+model_path = "../fake_news_bert"
 print(f"⬇️ Loading model from {model_path}...")
 tokenizer = DistilBertTokenizerFast.from_pretrained(model_path, do_lower_case=True)
 model = DistilBertForSequenceClassification.from_pretrained(model_path, num_labels=2)
@@ -28,7 +29,7 @@ def tokenize_data(texts, max_length=512):
    results = {'input_ids': [], 'attention_mask': []}
    batch_size = 1000
    
-    for i in tqdm(range(0, len(texts), batch_size, desc="Tokenizing")):
+    for i in tqdm(range(0, len(texts), batch_size), desc="Tokenizing", unit="batch"):
        batch = texts[i:i+batch_size]
        encoded = tokenizer(
            batch,