Add data splitting and simple model
archives/fnc2.log (new file, 27 additions)
@@ -0,0 +1,27 @@
+📚 Loading data from Parquet file at '../data/sampled_fakenews_train.parquet'
+📚 Loading data from Parquet file at '../data/sampled_fakenews_valid.parquet'
+📚 Loading data from Parquet file at '../data/sampled_fakenews_test.parquet'
+🧮 Grouping into binary classes...
+🪙 Preprocessing text...
+🔍 Training models...
+
+📊 Logistic Regression Test Performance:
+              precision    recall  f1-score   support
+
+    Reliable       0.84      0.90      0.87     54706
+        Fake       0.80      0.70      0.75     30584
+
+    accuracy                           0.83     85290
+   macro avg       0.82      0.80      0.81     85290
+weighted avg       0.83      0.83      0.83     85290
+
+
+📊 Naïve Bayes Test Performance:
+              precision    recall  f1-score   support
+
+    Reliable       0.79      0.92      0.85     54706
+        Fake       0.79      0.57      0.67     30584
+
+    accuracy                           0.79     85290
+   macro avg       0.79      0.74      0.76     85290
+weighted avg       0.79      0.79      0.78     85290
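The summary rows in these reports follow mechanically from the per-class rows, and the main gap between the two models is recall on the Fake class (0.70 for Logistic Regression vs 0.57 for Naïve Bayes). A quick sanity check of the Naïve Bayes summary, using the rounded figures above (so the last digit can drift by one):

    # Recompute the Naïve Bayes summary rows from its per-class rows
    support = {"Reliable": 54706, "Fake": 30584}
    recall = {"Reliable": 0.92, "Fake": 0.57}
    f1 = {"Reliable": 0.85, "Fake": 0.67}
    total = sum(support.values())  # 85290

    # accuracy = correctly classified / total = sum of recall * support over classes
    accuracy = sum(recall[c] * support[c] for c in support) / total  # ~0.79

    # macro avg weights both classes equally; weighted avg weights by support
    macro_f1 = sum(f1.values()) / len(f1)                           # 0.76
    weighted_f1 = sum(f1[c] * support[c] for c in support) / total  # ~0.785; the report's 0.78 uses unrounded per-class f1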
@@ -4,4 +4,8 @@ spacy
 nltk
 matplotlib
 pandarallel
 pyarrow
+scikit-learn
+torch
+transformers
+accelerate
src/fnc1b.py (22 additions)
@@ -72,4 +72,24 @@ print(f"✅ Sample contains {len(df_sample):,} rows (expected {get_sample_size(t
 df_sample.to_csv(f"{sample_path}.csv", index=False)
 df_sample.to_parquet(f"{sample_path}.parquet", index=False)
 
 print(f"💾 Sample saved to '{sample_path}.csv' and '{sample_path}.parquet'.")
+
+# Split to 80/10/10 and save as both CSV and Parquet
+train_size = int(len(df_sample) * 0.8)
+valid_size = int(len(df_sample) * 0.1)
+test_size = len(df_sample) - (train_size + valid_size)  # Ensure the sum is correct
+
+df_train = df_sample.iloc[:train_size]
+df_valid = df_sample.iloc[train_size:train_size + valid_size]
+df_test = df_sample.iloc[train_size + valid_size:]
+
+df_train.to_csv(f"{sample_path}_train.csv", index=False)
+df_valid.to_csv(f"{sample_path}_valid.csv", index=False)
+df_test.to_csv(f"{sample_path}_test.csv", index=False)
+
+df_train.to_parquet(f"{sample_path}_train.parquet", index=False)
+df_valid.to_parquet(f"{sample_path}_valid.parquet", index=False)
+df_test.to_parquet(f"{sample_path}_test.parquet", index=False)
+
+print(f"💾 Train/Valid/Test splits saved to '{sample_path}_train.csv', '{sample_path}_valid.csv', '{sample_path}_test.csv'.")
+
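Note that the iloc slices above split strictly by row order, so the 80/10/10 split is only representative if df_sample is already shuffled at this point. A hypothetical alternative using scikit-learn, not what this commit does: it shuffles explicitly and stratifies on the 'type' column (assuming every type value occurs often enough to stratify on):

    from sklearn.model_selection import train_test_split

    # 80/20 first, then split the 20% in half -> 80/10/10,
    # keeping the label mix comparable across the three splits
    df_train, df_rest = train_test_split(
        df_sample, test_size=0.2, random_state=42, stratify=df_sample["type"])
    df_valid, df_test = train_test_split(
        df_rest, test_size=0.5, random_state=42, stratify=df_rest["type"])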
src/fnc2.py (new file, 74 additions)
@@ -0,0 +1,74 @@
+import pandas as pd
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.linear_model import LogisticRegression
+from sklearn.model_selection import GridSearchCV
+from sklearn.naive_bayes import MultinomialNB
+from sklearn.metrics import classification_report
+
+# Load Parquet first, fall back to CSV if it is not available
+def load_split(file_prefix, split_name):
+    try:
+        print(f"📚 Loading data from Parquet file at '{file_prefix}_{split_name}.parquet'")
+        return pd.read_parquet(f"{file_prefix}_{split_name}.parquet")
+    except FileNotFoundError:
+        print(f"🔄 Parquet file not found, loading from CSV at '{file_prefix}_{split_name}.csv'")
+    try:
+        return pd.read_csv(f"{file_prefix}_{split_name}.csv")
+    except FileNotFoundError:
+        raise SystemExit(f"❌ Error: Neither Parquet nor CSV file found at '{file_prefix}_{split_name}.parquet' or '{file_prefix}_{split_name}.csv'")
+
+train = load_split("../data/sampled_fakenews", "train")
+val = load_split("../data/sampled_fakenews", "valid")
+test = load_split("../data/sampled_fakenews", "test")
+
+# "Political" and "bias" may not be inherently fake, and "unknown" is neutral
+print("🧮 Grouping into binary classes...")
+fake_labels = {'fake', 'conspiracy', 'rumor', 'unreliable', 'junksci', 'hate', 'satire', 'clickbait'}
+for df in [train, val, test]:
+    df['label'] = df['type'].apply(lambda x: 1 if x in fake_labels else 0)
+
+print("🪙 Preprocessing text...")
+tfidf = TfidfVectorizer(max_features=5000)
+
+X_train = tfidf.fit_transform(train['processed_text'])
+X_val = tfidf.transform(val['processed_text'])
+X_test = tfidf.transform(test['processed_text'])
+
+y_train = train['label']
+y_val = val['label']
+y_test = test['label']
+
+print("🔍 Training models...")
+lr = LogisticRegression(C=10, max_iter=100, class_weight=None, random_state=42)
+lr.fit(X_train, y_train)
+
+nb = MultinomialNB(alpha=0.1, fit_prior=True)
+nb.fit(X_train, y_train)
+
+y_test_pred_lr = lr.predict(X_test)
+print("\n📊 Logistic Regression Test Performance:")
+print(classification_report(y_test, y_test_pred_lr, target_names=['Reliable', 'Fake']))
+
+y_test_pred_nb = nb.predict(X_test)
+print("\n📊 Naïve Bayes Test Performance:")
+print(classification_report(y_test, y_test_pred_nb, target_names=['Reliable', 'Fake']))
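Not part of the commit, but a quick sanity check on what the linear model learned: with the tfidf and lr objects above in scope, the most heavily weighted terms per class can be listed (get_feature_names_out requires scikit-learn >= 1.0):

    import numpy as np

    feature_names = tfidf.get_feature_names_out()
    weights = lr.coef_[0]  # positive pushes toward 'Fake' (1), negative toward 'Reliable' (0)

    top_fake = np.argsort(weights)[-10:][::-1]  # ten largest coefficients
    top_reliable = np.argsort(weights)[:10]     # ten smallest coefficients
    print("Strongest 'Fake' terms:    ", [feature_names[i] for i in top_fake])
    print("Strongest 'Reliable' terms:", [feature_names[i] for i in top_reliable])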
+
+# Function to perform hyperparameter tuning, not used in the final script
+def hyperparameter_tuning():
+    print("🔍 Hyperparameter tuning...")
+    param_grid_lr = {
+        'C': [0.1, 1, 10],
+        'max_iter': [100, 500, 1000],
+        'class_weight': ['balanced', None]
+    }
+    grid = GridSearchCV(LogisticRegression(), param_grid_lr, cv=3)
+    grid.fit(X_train, y_train)
+    print("✅ Best Logistic Regression Parameters:", grid.best_params_)
+
+    param_grid_nb = {
+        'alpha': [0.1, 0.5, 1.0, 2.0],
+        'fit_prior': [True, False]
+    }
+    grid_nb = GridSearchCV(MultinomialNB(), param_grid_nb, cv=3)
+    grid_nb.fit(X_val, y_val)
+    print("✅ Best Naïve Bayes Parameters:", grid_nb.best_params_)
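The hard-coded hyperparameters above (C=10, max_iter=100, class_weight=None; alpha=0.1, fit_prior=True) all lie inside these grids, which suggests the function was run once and its output copied in; note also that the LR grid searches X_train while the NB grid searches X_val. A hypothetical way to wire it back in, assuming hyperparameter_tuning() were changed to return grid.best_params_ and grid_nb.best_params_ instead of only printing them:

    best_lr_params, best_nb_params = hyperparameter_tuning()

    lr = LogisticRegression(**best_lr_params, random_state=42).fit(X_train, y_train)
    nb = MultinomialNB(**best_nb_params).fit(X_train, y_train)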