diff --git a/archives/fnc4b.log b/archives/fnc4b.log new file mode 100644 index 0000000..43e0e5a --- /dev/null +++ b/archives/fnc4b.log @@ -0,0 +1,23 @@ +๐Ÿ“š Loading LIAR dataset... +๐Ÿงฎ Grouping into binary classes... +โฌ‡๏ธ Loading model from C:/Users/andre/OneDrive/Documents/code/fake_news_bert... +Some weights of the model checkpoint at C:/Users/andre/OneDrive/Documents/code/fake_news_bert were not used when initializing DistilBertForSequenceClassification: ['loss_fct.weight'] +- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). +- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). +๐Ÿช™ Tokenizing text... +Tokenizing: 100%|โ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆ| 2/2 [00:00<00:00, 16.52batch/s] + +๐Ÿ“ Creating dataset... + +๐Ÿงช Evaluating on LIAR test set... +Predicting: 100%|โ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆ| 40/40 [00:09<00:00, 4.01it/s] + +๐Ÿ“Š DistilBERT Performance on LIAR Dataset: + precision recall f1-score support + + Reliable 0.74 0.67 0.70 926 + Fake 0.28 0.34 0.31 338 + + accuracy 0.59 1264 + macro avg 0.51 0.51 0.51 1264 +weighted avg 0.61 0.59 0.60 1264 diff --git a/src/fnc4b.py b/src/fnc4b.py index b4c1233..c2025d6 100644 --- a/src/fnc4b.py +++ b/src/fnc4b.py @@ -2,10 +2,9 @@ import torch from torch.utils.data import Dataset import pandas as pd import numpy as np -from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay +from sklearn.metrics import classification_report from tqdm import tqdm from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification -import matplotlib.pyplot as plt # 1. Load and preprocess LIAR dataset print("๐Ÿ“š Loading LIAR dataset...") @@ -16,8 +15,10 @@ print("๐Ÿงฎ Grouping into binary classes...") liar_fake_labels = {'false', 'pants-fire'} # Update with your actual LIAR labels liar_test['label'] = liar_test.iloc[:, 1].apply(lambda x: 1 if x in liar_fake_labels else 0) +liar_test = liar_test.dropna(subset=["processed_text"]) + # 2. Load model and tokenizer -model_path = "./fake_news_bert" +model_path = "../fake_news_bert" print(f"โฌ‡๏ธ Loading model from {model_path}...") tokenizer = DistilBertTokenizerFast.from_pretrained(model_path, do_lower_case=True) model = DistilBertForSequenceClassification.from_pretrained(model_path, num_labels=2) @@ -28,7 +29,7 @@ def tokenize_data(texts, max_length=512): results = {'input_ids': [], 'attention_mask': []} batch_size = 1000 - for i in tqdm(range(0, len(texts), batch_size, desc="Tokenizing")): + for i in tqdm(range(0, len(texts), batch_size), desc="Tokenizing", unit="batch"): batch = texts[i:i+batch_size] encoded = tokenizer( batch, @@ -104,4 +105,4 @@ y_pred, y_true = predict(model, test_dataset) # 7. Performance Report print("\n๐Ÿ“Š DistilBERT Performance on LIAR Dataset:") -print(classification_report(y_true, y_pred, target_names=['Reliable', 'Fake'])) +print(classification_report(y_true, y_pred, target_names=['Reliable', 'Fake'])) \ No newline at end of file