Add preprocessing and evaluation for LIAR dataset using DistilBERT
This commit is contained in:
@@ -5,7 +5,7 @@
|
|||||||
🪙 Preprocessing text...
|
🪙 Preprocessing text...
|
||||||
🔍 Training models...
|
🔍 Training models...
|
||||||
|
|
||||||
📊 Logistic Regression Test Performance:
|
📊 Logistic Regression FakeNewsCorpus Performance:
|
||||||
precision recall f1-score support
|
precision recall f1-score support
|
||||||
|
|
||||||
Reliable 0.84 0.90 0.87 54706
|
Reliable 0.84 0.90 0.87 54706
|
||||||
@@ -16,7 +16,7 @@
|
|||||||
weighted avg 0.83 0.83 0.83 85290
|
weighted avg 0.83 0.83 0.83 85290
|
||||||
|
|
||||||
|
|
||||||
📊 Naïve Bayes Test Performance:
|
📊 Naïve Bayes FakeNewsCorpus Performance:
|
||||||
precision recall f1-score support
|
precision recall f1-score support
|
||||||
|
|
||||||
Reliable 0.79 0.92 0.85 54706
|
Reliable 0.79 0.92 0.85 54706
|
||||||
@@ -25,3 +25,28 @@ weighted avg 0.83 0.83 0.83 85290
|
|||||||
accuracy 0.79 85290
|
accuracy 0.79 85290
|
||||||
macro avg 0.79 0.74 0.76 85290
|
macro avg 0.79 0.74 0.76 85290
|
||||||
weighted avg 0.79 0.79 0.78 85290
|
weighted avg 0.79 0.79 0.78 85290
|
||||||
|
|
||||||
|
📚 Loading LIAR dataset...
|
||||||
|
🧮 Grouping into binary classes...
|
||||||
|
🪙 Preprocessing text...
|
||||||
|
|
||||||
|
📊 Logistic Regression LIAR Performance:
|
||||||
|
precision recall f1-score support
|
||||||
|
|
||||||
|
Reliable 0.75 0.79 0.77 926
|
||||||
|
Fake 0.32 0.26 0.29 338
|
||||||
|
|
||||||
|
accuracy 0.65 1264
|
||||||
|
macro avg 0.53 0.53 0.53 1264
|
||||||
|
weighted avg 0.63 0.65 0.64 1264
|
||||||
|
|
||||||
|
|
||||||
|
📊 Naïve Bayes LIAR Performance:
|
||||||
|
precision recall f1-score support
|
||||||
|
|
||||||
|
Reliable 0.74 0.98 0.84 926
|
||||||
|
Fake 0.55 0.06 0.11 338
|
||||||
|
|
||||||
|
accuracy 0.74 1264
|
||||||
|
macro avg 0.65 0.52 0.48 1264
|
||||||
|
weighted avg 0.69 0.74 0.65 1264
|
||||||
26
src/fnc1a.py
26
src/fnc1a.py
@@ -32,31 +32,29 @@ except OSError:
|
|||||||
print_log("📖 spaCy model loaded.")
|
print_log("📖 spaCy model loaded.")
|
||||||
|
|
||||||
# Paths
|
# Paths
|
||||||
csv_path = "../data/news_cleaned_2018_02_13.csv"
|
input_path = "../data/news_cleaned_2018_02_13"
|
||||||
parquet_path = "../data/news_cleaned_2018_02_13.parquet"
|
output_path = "../data/processed_fakenews"
|
||||||
output_parquet = "../data/processed_fakenews.parquet"
|
|
||||||
output_csv = "../data/processed_fakenews.csv"
|
|
||||||
|
|
||||||
# Convert CSV to Parquet if needed
|
# Convert CSV to Parquet if needed
|
||||||
if os.path.exists(parquet_path):
|
if os.path.exists(input_path + ".parquet"):
|
||||||
data_path = parquet_path
|
data_path = input_path + ".parquet"
|
||||||
elif os.path.exists(csv_path):
|
elif os.path.exists(input_path + ".csv"):
|
||||||
print_log("🔄 Converting CSV to Parquet...")
|
print_log("🔄 Converting CSV to Parquet...")
|
||||||
|
|
||||||
chunksize=1e5
|
chunksize=1e5
|
||||||
pqwriter = None
|
pqwriter = None
|
||||||
for i, df in enumerate(pd.read_csv(csv_path, lineterminator="\n", on_bad_lines="skip", chunksize=chunksize, usecols=["id", "content", "type"])):
|
for i, df in enumerate(pd.read_csv(input_path + ".csv", lineterminator="\n", on_bad_lines="skip", chunksize=chunksize, usecols=["id", "content", "type"])):
|
||||||
table = pa.Table.from_pandas(df)
|
table = pa.Table.from_pandas(df)
|
||||||
# If it's the first chunk, create a new parquet writer
|
# If it's the first chunk, create a new parquet writer
|
||||||
if i == 0:
|
if i == 0:
|
||||||
pqwriter = pq.ParquetWriter(parquet_path, table.schema)
|
pqwriter = pq.ParquetWriter(input_path + ".parquet", table.schema)
|
||||||
pqwriter.write_table(table)
|
pqwriter.write_table(table)
|
||||||
|
|
||||||
if pqwriter:
|
if pqwriter:
|
||||||
pqwriter.close()
|
pqwriter.close()
|
||||||
|
|
||||||
print_log("✅ Conversion complete.")
|
print_log("✅ Conversion complete.")
|
||||||
data_path = parquet_path
|
data_path = input_path + ".parquet"
|
||||||
else:
|
else:
|
||||||
print_log("❌ Error: No dataset found.")
|
print_log("❌ Error: No dataset found.")
|
||||||
exit()
|
exit()
|
||||||
@@ -120,12 +118,12 @@ for batch in parquet_file.iter_batches(batch_size):
|
|||||||
|
|
||||||
# Save processed data
|
# Save processed data
|
||||||
final_df = pd.concat(processed_chunks, ignore_index=True)
|
final_df = pd.concat(processed_chunks, ignore_index=True)
|
||||||
final_df.to_parquet(output_parquet, index=False)
|
final_df.to_parquet(output_path + ".parquet", index=False)
|
||||||
final_df.to_csv(output_csv, index=False)
|
final_df.to_csv(output_path + ".csv", index=False)
|
||||||
|
|
||||||
print_log(f"💾 Processed data saved to '{output_parquet}' and '{output_csv}'")
|
print_log(f"💾 Processed data saved to '{output_path + ".parquet"}' and '{output_path + ".csv"}'")
|
||||||
|
|
||||||
# Print statistics
|
# Print statistics
|
||||||
total_vocab_before = len(vocab_before)
|
total_vocab_before = len(vocab_before)
|
||||||
total_vocab_after_stopwords = len(vocab_after_stopwords)
|
total_vocab_after_stopwords = len(vocab_after_stopwords)
|
||||||
total_vocab_after_stemming = len(vocab_after_stemming)
|
total_vocab_after_stemming = len(vocab_after_stemming)
|
||||||
|
|||||||
53
src/fnc1b.py
53
src/fnc1b.py
@@ -1,13 +1,10 @@
|
|||||||
import random
|
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
import os
|
import os
|
||||||
import subprocess
|
import subprocess
|
||||||
import pyarrow as pa
|
|
||||||
import pyarrow.parquet as pq
|
import pyarrow.parquet as pq
|
||||||
|
|
||||||
parquet_path = "../data/processed_fakenews.parquet"
|
input_path = "../data/processed_fakenews"
|
||||||
csv_path = "../data/processed_fakenews.csv"
|
output_path = "../data/sampled_fakenews"
|
||||||
sample_path = "../data/sampled_fakenews"
|
|
||||||
SAMPLE_FRACTION = 0.1
|
SAMPLE_FRACTION = 0.1
|
||||||
RANDOM_SEED = 42 # For reproducibility
|
RANDOM_SEED = 42 # For reproducibility
|
||||||
|
|
||||||
@@ -22,57 +19,57 @@ def sample_dataframe(df, total_rows):
|
|||||||
return df.sample(n=sample_size, random_state=RANDOM_SEED)
|
return df.sample(n=sample_size, random_state=RANDOM_SEED)
|
||||||
|
|
||||||
# Try to load from Parquet first, fall back to CSV if not available
|
# Try to load from Parquet first, fall back to CSV if not available
|
||||||
if os.path.exists(parquet_path):
|
if os.path.exists(input_path + ".parquet"):
|
||||||
print(f"🔍 Loading data from Parquet file at '{parquet_path}'")
|
print(f"🔍 Loading data from Parquet file at '{input_path + ".parquet"}'")
|
||||||
try:
|
try:
|
||||||
# Read metadata to get row count without loading entire file
|
# Read metadata to get row count without loading entire file
|
||||||
parquet_file = pq.ParquetFile(parquet_path)
|
parquet_file = pq.ParquetFile(input_path + ".parquet")
|
||||||
total_rows = parquet_file.metadata.num_rows
|
total_rows = parquet_file.metadata.num_rows
|
||||||
print(f"🔍 Dataset contains {total_rows:,} rows.")
|
print(f"🔍 Dataset contains {total_rows:,} rows.")
|
||||||
|
|
||||||
# Read and sample the data
|
# Read and sample the data
|
||||||
df_sample = sample_dataframe(pd.read_parquet(parquet_path), total_rows)
|
df_sample = sample_dataframe(pd.read_parquet(input_path + ".parquet"), total_rows)
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"❌ Error reading Parquet file: {e}")
|
print(f"❌ Error reading Parquet file: {e}")
|
||||||
print("🔄 Falling back to CSV...")
|
print("🔄 Falling back to CSV...")
|
||||||
if not os.path.exists(csv_path):
|
if not os.path.exists(input_path + ".csv"):
|
||||||
print(f"❌ Error: Neither Parquet nor CSV file found at {parquet_path} or {csv_path}")
|
print(f"❌ Error: Neither Parquet nor CSV file found at {input_path + ".parquet"} or {input_path + ".csv"}")
|
||||||
exit()
|
exit()
|
||||||
|
|
||||||
# Get total rows from CSV (Unix-like systems only due to `wc`)
|
# Get total rows from CSV (Unix-like systems only due to `wc`)
|
||||||
total_rows = int(subprocess.check_output(["wc", "-l", csv_path]).split()[0]) - 1
|
total_rows = int(subprocess.check_output(["wc", "-l", input_path + ".csv"]).split()[0]) - 1
|
||||||
print(f"🔍 Dataset contains {total_rows:,} rows.")
|
print(f"🔍 Dataset contains {total_rows:,} rows.")
|
||||||
|
|
||||||
# Read and sample the data
|
# Read and sample the data
|
||||||
df_sample = sample_dataframe(
|
df_sample = sample_dataframe(
|
||||||
pd.read_csv(csv_path, lineterminator="\n", on_bad_lines="skip"),
|
pd.read_csv(input_path + ".csv", lineterminator="\n", on_bad_lines="skip"),
|
||||||
total_rows
|
total_rows
|
||||||
)
|
)
|
||||||
|
|
||||||
elif os.path.exists(csv_path):
|
elif os.path.exists(input_path + ".csv"):
|
||||||
print(f"🔍 Parquet file not found, loading from CSV at {csv_path}")
|
print(f"🔍 Parquet file not found, loading from CSV at {input_path + ".csv"}")
|
||||||
# Get total rows from CSV (Unix-like systems only due to `wc`)
|
# Get total rows from CSV (Unix-like systems only due to `wc`)
|
||||||
total_rows = int(subprocess.check_output(["wc", "-l", csv_path]).split()[0]) - 1
|
total_rows = int(subprocess.check_output(["wc", "-l", input_path + ".csv"]).split()[0]) - 1
|
||||||
print(f"🔍 Dataset contains {total_rows:,} rows.")
|
print(f"🔍 Dataset contains {total_rows:,} rows.")
|
||||||
|
|
||||||
# Read and sample the data
|
# Read and sample the data
|
||||||
df_sample = sample_dataframe(
|
df_sample = sample_dataframe(
|
||||||
pd.read_csv(csv_path, lineterminator="\n", on_bad_lines="skip"),
|
pd.read_csv(input_path + ".csv", lineterminator="\n", on_bad_lines="skip"),
|
||||||
total_rows
|
total_rows
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
print(f"❌ Error: Neither Parquet nor CSV file found at {parquet_path} or {csv_path}")
|
print(f"❌ Error: Neither Parquet nor CSV file found at {input_path + ".parquet"} or {input_path + ".csv"}")
|
||||||
exit()
|
exit()
|
||||||
|
|
||||||
# Verify the sample size
|
# Verify the sample size
|
||||||
print(f"✅ Sample contains {len(df_sample):,} rows (expected {get_sample_size(total_rows=total_rows):,} rows)")
|
print(f"✅ Sample contains {len(df_sample):,} rows (expected {get_sample_size(total_rows=total_rows):,} rows)")
|
||||||
|
|
||||||
# Save the sample in both formats
|
# Save the sample in both formats
|
||||||
df_sample.to_csv(f"{sample_path}.csv", index=False)
|
df_sample.to_csv(f"{output_path}.csv", index=False)
|
||||||
df_sample.to_parquet(f"{sample_path}.parquet", index=False)
|
df_sample.to_parquet(f"{output_path}.parquet", index=False)
|
||||||
|
|
||||||
print(f"💾 Sample saved to '{sample_path}.csv' and '{sample_path}.parquet'.")
|
print(f"💾 Sample saved to '{output_path}.csv' and '{output_path}.parquet'.")
|
||||||
|
|
||||||
# Split to 80/10/10 and save as both CSV and Parquet
|
# Split to 80/10/10 and save as both CSV and Parquet
|
||||||
train_size = int(len(df_sample) * 0.8)
|
train_size = int(len(df_sample) * 0.8)
|
||||||
@@ -83,13 +80,13 @@ df_train = df_sample.iloc[:train_size]
|
|||||||
df_valid = df_sample.iloc[train_size:train_size + valid_size]
|
df_valid = df_sample.iloc[train_size:train_size + valid_size]
|
||||||
df_test = df_sample.iloc[train_size + valid_size:]
|
df_test = df_sample.iloc[train_size + valid_size:]
|
||||||
|
|
||||||
df_train.to_csv(f"{sample_path}_train.csv", index=False)
|
df_train.to_csv(f"{output_path}_train.csv", index=False)
|
||||||
df_valid.to_csv(f"{sample_path}_valid.csv", index=False)
|
df_valid.to_csv(f"{output_path}_valid.csv", index=False)
|
||||||
df_test.to_csv(f"{sample_path}_test.csv", index=False)
|
df_test.to_csv(f"{output_path}_test.csv", index=False)
|
||||||
|
|
||||||
df_train.to_parquet(f"{sample_path}_train.parquet", index=False)
|
df_train.to_parquet(f"{output_path}_train.parquet", index=False)
|
||||||
df_valid.to_parquet(f"{sample_path}_valid.parquet", index=False)
|
df_valid.to_parquet(f"{output_path}_valid.parquet", index=False)
|
||||||
df_test.to_parquet(f"{sample_path}_test.parquet", index=False)
|
df_test.to_parquet(f"{output_path}_test.parquet", index=False)
|
||||||
|
|
||||||
print(f"💾 Train/Valid/Test splits saved to '{sample_path}_train.csv', '{sample_path}_valid.csv', '{sample_path}_test.csv'.")
|
print(f"💾 Train/Valid/Test splits saved to '{output_path}_train.csv', '{output_path}_valid.csv', '{output_path}_test.csv'.")
|
||||||
|
|
||||||
|
|||||||
78
src/fnc2.py
78
src/fnc2.py
@@ -5,6 +5,30 @@ from sklearn.model_selection import GridSearchCV
|
|||||||
from sklearn.naive_bayes import MultinomialNB
|
from sklearn.naive_bayes import MultinomialNB
|
||||||
from sklearn.metrics import classification_report
|
from sklearn.metrics import classification_report
|
||||||
|
|
||||||
|
input_path = "../data/sampled_fakenews"
|
||||||
|
|
||||||
|
# Function to perform hyperparameter tuning, not used in the final script
|
||||||
|
def hyperparameter_tuning():
    """Grid-search Logistic Regression and Naïve Bayes and print the winners.

    Reads ``X_train``/``y_train`` and ``X_val``/``y_val`` from module scope,
    so it can only be called after those have been built. Each search uses
    3-fold cross-validation; nothing is returned, the best parameter sets
    are only printed.
    """
    print("🔍 Hyperparameter tuning...")

    # Logistic Regression: regularisation strength, iteration cap, weighting.
    lr_search = GridSearchCV(
        LogisticRegression(),
        {
            'C': [0.1, 1, 10],
            'max_iter': [100, 500, 1000],
            'class_weight': ['balanced', None],
        },
        cv=3,
    )
    lr_search.fit(X_train, y_train)
    print("✅ Best Logistic Regression Parameters:", lr_search.best_params_)

    # Naïve Bayes: smoothing and whether to learn class priors.
    nb_search = GridSearchCV(
        MultinomialNB(),
        {
            'alpha': [0.1, 0.5, 1.0, 2.0],
            'fit_prior': [True, False],
        },
        cv=3,
    )
    # NOTE(review): this search is fitted on the validation split while the
    # Logistic Regression search above uses the training split — confirm
    # that the asymmetry is intentional.
    nb_search.fit(X_val, y_val)
    print("✅ Best Naïve Bayes Parameters:", nb_search.best_params_)
|
||||||
|
|
||||||
|
#---FAKENEWSCORPUS DATASET---
|
||||||
|
|
||||||
# Load parquet first, fall back to CSV if not available
|
# Load parquet first, fall back to CSV if not available
|
||||||
def load_split(file_prefix, split_name):
|
def load_split(file_prefix, split_name):
|
||||||
try:
|
try:
|
||||||
@@ -17,9 +41,9 @@ def load_split(file_prefix, split_name):
|
|||||||
print(f"🔄 Parquet file not found, loading from CSV at '{file_prefix}_{split_name}.csv'")
|
print(f"🔄 Parquet file not found, loading from CSV at '{file_prefix}_{split_name}.csv'")
|
||||||
return pd.read_csv(f"{file_prefix}_{split_name}.csv")
|
return pd.read_csv(f"{file_prefix}_{split_name}.csv")
|
||||||
|
|
||||||
train = load_split("../data/sampled_fakenews", "train")
|
train = load_split(input_path, "train")
|
||||||
val = load_split("../data/sampled_fakenews", "valid")
|
val = load_split(input_path, "valid")
|
||||||
test = load_split("../data/sampled_fakenews", "test")
|
test = load_split(input_path, "test")
|
||||||
|
|
||||||
# "Political" and "bias" may not be inherently fake, and "unknown" is neutral
|
# "Political" and "bias" may not be inherently fake, and "unknown" is neutral
|
||||||
print("🧮 Grouping into binary classes...")
|
print("🧮 Grouping into binary classes...")
|
||||||
@@ -46,29 +70,37 @@ nb = MultinomialNB(alpha=0.1, fit_prior=True)
|
|||||||
nb.fit(X_train, y_train)
|
nb.fit(X_train, y_train)
|
||||||
|
|
||||||
y_test_pred_lr = lr.predict(X_test)
|
y_test_pred_lr = lr.predict(X_test)
|
||||||
print("\n📊 Logistic Regression Test Performance:")
|
print("\n📊 Logistic Regression FakeNewsCorpus Performance:")
|
||||||
print(classification_report(y_test, y_test_pred_lr, target_names=['Reliable', 'Fake']))
|
print(classification_report(y_test, y_test_pred_lr, target_names=['Reliable', 'Fake']))
|
||||||
|
|
||||||
y_test_pred_nb = nb.predict(X_test)
|
y_test_pred_nb = nb.predict(X_test)
|
||||||
print("\n📊 Naïve Bayes Test Performance:")
|
print("\n📊 Naïve Bayes FakeNewsCorpus Performance:")
|
||||||
print(classification_report(y_test, y_test_pred_nb, target_names=['Reliable', 'Fake']))
|
print(classification_report(y_test, y_test_pred_nb, target_names=['Reliable', 'Fake']))
|
||||||
|
|
||||||
# Function to perform hyperparameter tuning, not used in the final script
|
#---LIAR DATASET---
|
||||||
def hyperparameter_tuning():
|
# Load the preprocessed LIAR test split (CSV)
|
||||||
print("🔍 Hyperparameter tuning...")
|
print("📚 Loading LIAR dataset...")
|
||||||
param_grid_lr = {
|
liar_test = pd.read_csv("../data/liar_test_processed.csv")
|
||||||
'C': [0.1, 1, 10],
|
|
||||||
'max_iter': [100, 500, 1000],
|
# Only "false" and "pants-fire" are treated as fake; every other LIAR label counts as reliable
|
||||||
'class_weight': ['balanced', None]
|
print("🧮 Grouping into binary classes...")
|
||||||
}
|
liar_fake_labels = {'false', 'pants-fire'}
|
||||||
grid = GridSearchCV(LogisticRegression(), param_grid_lr, cv=3)
|
liar_test['label'] = liar_test.iloc[:, 1].apply(lambda x: 1 if x in liar_fake_labels else 0)
|
||||||
grid.fit(X_train, y_train)
|
|
||||||
print("✅ Best Logistic Regression Parameters:", grid.best_params_)
|
# Drop rows whose processed_text is NaN
|
||||||
|
liar_test = liar_test.dropna(subset=['processed_text'])
|
||||||
|
|
||||||
|
# Transform LIAR text using the same TF-IDF vectorizer
|
||||||
|
print("🪙 Preprocessing text...")
|
||||||
|
X_liar_test = tfidf.transform(liar_test['processed_text'])
|
||||||
|
|
||||||
|
# Logistic Regression
|
||||||
|
y_liar_pred_lr = lr.predict(X_liar_test)
|
||||||
|
print("\n📊 Logistic Regression LIAR Performance:")
|
||||||
|
print(classification_report(liar_test['label'], y_liar_pred_lr, target_names=['Reliable', 'Fake']))
|
||||||
|
|
||||||
|
# Naïve Bayes
|
||||||
|
y_liar_pred_nb = nb.predict(X_liar_test)
|
||||||
|
print("\n📊 Naïve Bayes LIAR Performance:")
|
||||||
|
print(classification_report(liar_test['label'], y_liar_pred_nb, target_names=['Reliable', 'Fake']))
|
||||||
|
|
||||||
param_grid_nb = {
|
|
||||||
'alpha': [0.1, 0.5, 1.0, 2.0],
|
|
||||||
'fit_prior': [True, False]
|
|
||||||
}
|
|
||||||
grid_nb = GridSearchCV(MultinomialNB(), param_grid_nb, cv=3)
|
|
||||||
grid_nb.fit(X_val, y_val)
|
|
||||||
print("✅ Best Naïve Bayes Parameters:", grid_nb.best_params_)
|
|
||||||
|
|||||||
38
src/fnc4a.py
Normal file
38
src/fnc4a.py
Normal file
@@ -0,0 +1,38 @@
|
|||||||
|
import pandas as pd
|
||||||
|
from nltk.corpus import stopwords
|
||||||
|
from nltk.stem import PorterStemmer
|
||||||
|
|
||||||
|
input_path = "../data/liar"
|
||||||
|
output_path = "../data/liar_processed"
|
||||||
|
|
||||||
|
# Initialize preprocessing tools
|
||||||
|
stemmer = PorterStemmer()
|
||||||
|
stop_words = set(stopwords.words('english'))
|
||||||
|
|
||||||
|
# Apply the same process as FakeNewsCorpus
|
||||||
|
def preprocess_text(text):
    """Return a cleaned, stemmed version of *text* as one space-joined string.

    Non-string input (e.g. a NaN cell) yields an empty string. Otherwise the
    text is split on whitespace and each token goes through three steps:
    keep it only if it is purely alphabetic, lowercase it and drop it if it
    is in ``stop_words``, then stem it with ``stemmer``.
    """
    if not isinstance(text, str):
        return ""

    kept = []
    for raw in text.split():
        # Tokens with digits or attached punctuation are discarded outright.
        if not raw.isalpha():
            continue

        lowered = raw.lower()
        if lowered in stop_words:
            continue

        kept.append(stemmer.stem(lowered))

    return ' '.join(kept)
|
||||||
|
|
||||||
|
# Load the raw LIAR test split (tab-separated, no header row)
print("🔍 Loading LIAR dataset...")
liar_test = pd.read_csv(input_path + "_test.tsv", sep='\t', header=None)

# Apply preprocessing (column 2 contains the text statements)
print("🪙 Preprocessing LIAR text...")
liar_test['processed_text'] = liar_test[2].apply(preprocess_text)

# Save preprocessed data.
# The evaluation scripts (fnc2.py, fnc4b.py) read
# "../data/liar_test_processed.csv", and to_csv() writes comma-separated
# output by default, so write exactly that file rather than a ".tsv".
processed_file = "../data/liar_test_processed.csv"
liar_test.to_csv(processed_file, index=False)
# Plain variable in the f-string: nesting the same quote type inside an
# f-string expression is a SyntaxError before Python 3.12.
print(f"💾 Preprocessed LIAR data saved to '{processed_file}'")
|
||||||
107
src/fnc4b.py
Normal file
107
src/fnc4b.py
Normal file
@@ -0,0 +1,107 @@
|
|||||||
|
import torch
|
||||||
|
from torch.utils.data import Dataset
|
||||||
|
import pandas as pd
|
||||||
|
import numpy as np
|
||||||
|
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
|
||||||
|
from tqdm import tqdm
|
||||||
|
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification
|
||||||
|
import matplotlib.pyplot as plt
|
||||||
|
|
||||||
|
# 1. Load and preprocess LIAR dataset
print("📚 Loading LIAR dataset...")
liar_test = pd.read_csv("../data/liar_test_processed.csv")

# Binary label mapping: column 1 holds the LIAR label; "false" and
# "pants-fire" map to 1 (fake), every other label maps to 0 (reliable).
print("🧮 Grouping into binary classes...")
liar_fake_labels = {'false', 'pants-fire'}
liar_test['label'] = liar_test.iloc[:, 1].apply(lambda x: 1 if x in liar_fake_labels else 0)

# Statements whose processed text is empty are read back from the CSV as
# NaN. Drop them, as the TF-IDF evaluation in fnc2.py does, so the tokenizer
# only receives strings and both evaluations score the same rows.
liar_test = liar_test.dropna(subset=['processed_text'])

# 2. Load model and tokenizer
model_path = "./fake_news_bert"
print(f"⬇️ Loading model from {model_path}...")
tokenizer = DistilBertTokenizerFast.from_pretrained(model_path, do_lower_case=True)
model = DistilBertForSequenceClassification.from_pretrained(model_path, num_labels=2)

# 3. Tokenization
print("🪙 Tokenizing text...")
|
||||||
|
def tokenize_data(texts, max_length=512):
    """Tokenize *texts* in chunks and return the stacked tensors.

    Args:
        texts: Sequence of strings to encode with the module-level
            ``tokenizer``.
        max_length: Every sequence is truncated or padded to this length.

    Returns:
        Dict with ``'input_ids'`` and ``'attention_mask'``, each a tensor
        with one row per text and ``max_length`` columns. Empty input
        yields two ``(0, max_length)`` tensors.
    """
    if len(texts) == 0:
        # torch.cat() rejects an empty list, so return empty tensors of the
        # right width instead of crashing.
        return {
            'input_ids': torch.empty((0, max_length), dtype=torch.long),
            'attention_mask': torch.empty((0, max_length), dtype=torch.long),
        }

    results = {'input_ids': [], 'attention_mask': []}
    batch_size = 1000

    # `desc` is a tqdm argument; passing it to range() raised TypeError.
    for i in tqdm(range(0, len(texts), batch_size), desc="Tokenizing"):
        batch = texts[i:i + batch_size]
        encoded = tokenizer(
            batch,
            truncation=True,
            padding='max_length',
            max_length=max_length,
            return_tensors='pt',
            return_attention_mask=True,
            return_token_type_ids=False
        )
        results['input_ids'].append(encoded['input_ids'])
        results['attention_mask'].append(encoded['attention_mask'])

    # Join the per-chunk tensors along the row dimension.
    return {
        'input_ids': torch.cat(results['input_ids']),
        'attention_mask': torch.cat(results['attention_mask'])
    }
|
||||||
|
|
||||||
|
# Encode every preprocessed LIAR statement up front; .tolist() hands the
# tokenizer a plain Python list rather than a pandas Series.
test_encodings = tokenize_data(liar_test['processed_text'].tolist())
|
||||||
|
|
||||||
|
# 4. Dataset Class
|
||||||
|
class CustomDataset(Dataset):
    """Dataset pairing pre-tokenized encodings with integer labels."""

    def __init__(self, encodings, labels):
        """Store the encodings and convert the labels to a long tensor.

        Args:
            encodings: Mapping with ``'input_ids'`` and ``'attention_mask'``
                entries that can be indexed by sample position.
            labels: Object exposing ``.values`` (e.g. a pandas Series).
        """
        self.encodings = encodings
        label_array = labels.values
        self.labels = torch.tensor(label_array, dtype=torch.long)

    def __getitem__(self, idx):
        """Return the inputs and label of the sample at position *idx*."""
        sample = {
            key: self.encodings[key][idx]
            for key in ('input_ids', 'attention_mask')
        }
        sample['labels'] = self.labels[idx]
        return sample

    def __len__(self):
        """Return the number of labelled samples."""
        return self.labels.size(0)
|
||||||
|
|
||||||
|
print("\n📝 Creating dataset...")
|
||||||
|
# Pair the tokenized statements with their binary labels (1 = fake, 0 = reliable).
test_dataset = CustomDataset(test_encodings, liar_test['label'])
|
||||||
|
|
||||||
|
# 5. Prediction Function
|
||||||
|
def predict(model, dataset, batch_size=32):
    """Run *model* over *dataset* and collect predictions and true labels.

    The model is moved to CUDA when available (otherwise CPU) and put in
    eval mode. Samples are read by index in consecutive slices of
    ``batch_size`` and passed through the model without gradient tracking.

    Args:
        model: Callable taking ``input_ids`` and ``attention_mask`` keyword
            tensors and returning an object with a ``logits`` attribute.
        dataset: Indexable, sized collection of dicts with ``'input_ids'``,
            ``'attention_mask'`` and ``'labels'`` entries.
        batch_size: Number of samples per forward pass.

    Returns:
        Tuple ``(predictions, true_labels)`` of NumPy arrays.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    model.eval()

    preds, true_labels = [], []
    total = len(dataset)

    for start in tqdm(range(0, total, batch_size), desc="Predicting"):
        # The last slice may be shorter than batch_size.
        stop = min(start + batch_size, total)
        samples = [dataset[pos] for pos in range(start, stop)]

        # Stack the per-sample tensors into batch tensors on the device.
        ids = torch.stack([s['input_ids'] for s in samples]).to(device)
        mask = torch.stack([s['attention_mask'] for s in samples]).to(device)

        with torch.no_grad():
            outputs = model(input_ids=ids, attention_mask=mask)

        # The predicted class is the index of the largest logit per row.
        preds.extend(torch.argmax(outputs.logits, dim=1).cpu().numpy())
        true_labels.extend([s['labels'] for s in samples])

    return np.array(preds), np.array(true_labels)
|
||||||
|
|
||||||
|
# 6. Run Evaluation
|
||||||
|
print("\n🧪 Evaluating on LIAR test set...")
|
||||||
|
# predict() returns (predictions, true labels) in that order.
y_pred, y_true = predict(model, test_dataset)
|
||||||
|
|
||||||
|
# 7. Performance Report
|
||||||
|
print("\n📊 DistilBERT Performance on LIAR Dataset:")
|
||||||
|
print(classification_report(y_true, y_pred, target_names=['Reliable', 'Fake']))
|
||||||
Reference in New Issue
Block a user