diff --git a/archives/fnc2.log b/archives/fnc2.log
new file mode 100644
index 0000000..756a36e
--- /dev/null
+++ b/archives/fnc2.log
@@ -0,0 +1,27 @@
+📚 Loading data from Parquet file at '../data/sampled_fakenews_train.parquet'
+📚 Loading data from Parquet file at '../data/sampled_fakenews_valid.parquet'
+📚 Loading data from Parquet file at '../data/sampled_fakenews_test.parquet'
+🧮 Grouping into binary classes...
+🪙 Preprocessing text...
+🔍 Training models...
+
+📊 Logistic Regression Test Performance:
+              precision    recall  f1-score   support
+
+    Reliable       0.84      0.90      0.87     54706
+        Fake       0.80      0.70      0.75     30584
+
+    accuracy                           0.83     85290
+   macro avg       0.82      0.80      0.81     85290
+weighted avg       0.83      0.83      0.83     85290
+
+
+📊 Naïve Bayes Test Performance:
+              precision    recall  f1-score   support
+
+    Reliable       0.79      0.92      0.85     54706
+        Fake       0.79      0.57      0.67     30584
+
+    accuracy                           0.79     85290
+   macro avg       0.79      0.74      0.76     85290
+weighted avg       0.79      0.79      0.78     85290
diff --git a/requirements.txt b/requirements.txt
index 6e1abf8..0bfa32a 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -4,4 +4,8 @@ spacy
 nltk
 matplotlib
 pandarallel
-pyarrow
\ No newline at end of file
+pyarrow
+scikit-learn
+torch
+transformers
+accelerate
\ No newline at end of file
diff --git a/src/fnc1b.py b/src/fnc1b.py
index 796fecd..4e8bc4f 100644
--- a/src/fnc1b.py
+++ b/src/fnc1b.py
@@ -72,4 +72,24 @@ print(f"✅ Sample contains {len(df_sample):,} rows (expected {get_sample_size(t
 df_sample.to_csv(f"{sample_path}.csv", index=False)
 df_sample.to_parquet(f"{sample_path}.parquet", index=False)

-print(f"💾 Sample saved to '{sample_path}.csv' and '{sample_path}.parquet'.")
\ No newline at end of file
+print(f"💾 Sample saved to '{sample_path}.csv' and '{sample_path}.parquet'.")
+
+# Split to 80/10/10 and save as both CSV and Parquet
+train_size = int(len(df_sample) * 0.8)
+valid_size = int(len(df_sample) * 0.1)
+test_size = len(df_sample) - (train_size + valid_size)  # Ensure the sum is correct
+
+df_train = df_sample.iloc[:train_size]
+df_valid = df_sample.iloc[train_size:train_size + valid_size]
+df_test = df_sample.iloc[train_size + valid_size:]
+
+df_train.to_csv(f"{sample_path}_train.csv", index=False)
+df_valid.to_csv(f"{sample_path}_valid.csv", index=False)
+df_test.to_csv(f"{sample_path}_test.csv", index=False)
+
+df_train.to_parquet(f"{sample_path}_train.parquet", index=False)
+df_valid.to_parquet(f"{sample_path}_valid.parquet", index=False)
+df_test.to_parquet(f"{sample_path}_test.parquet", index=False)
+
+print(f"💾 Train/Valid/Test splits saved to '{sample_path}_train.csv', '{sample_path}_valid.csv', '{sample_path}_test.csv'.")
+
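Note on the hunk above: the 80/10/10 split slices `df_sample` sequentially, so it relies on the sample already being in random order. A minimal sketch of the same split with an explicit shuffle, assuming `df_sample` as defined in `fnc1b.py` and an arbitrary seed of 42:

```python
# Sketch only: 80/10/10 split with an explicit shuffle via scikit-learn.
# df_sample is the DataFrame from fnc1b.py; random_state=42 is an assumption.
from sklearn.model_selection import train_test_split

# First carve off 20% for validation+test, then split that remainder half-and-half.
df_train, df_rest = train_test_split(df_sample, test_size=0.2, random_state=42)
df_valid, df_test = train_test_split(df_rest, test_size=0.5, random_state=42)
```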
pd.read_csv(f"{file_prefix}_{split_name}.csv") + +train = load_split("../data/sampled_fakenews", "train") +val = load_split("../data/sampled_fakenews", "valid") +test = load_split("../data/sampled_fakenews", "test") + +# "Political" and "bias" may not be inherently fake, and "unknown" is neutral +print("🧮 Grouping into binary classes...") +fake_labels = {'fake', 'conspiracy', 'rumor', 'unreliable', 'junksci', 'hate', 'satire', 'clickbait'} +for df in [train, val, test]: + df['label'] = df['type'].apply(lambda x: 1 if x in fake_labels else 0) + +print("🪙 Preprocessing text...") +tfidf = TfidfVectorizer(max_features=5000) + +X_train = tfidf.fit_transform(train['processed_text']) +X_val = tfidf.transform(val['processed_text']) +X_test = tfidf.transform(test['processed_text']) + +y_train = train['label'] +y_val = val['label'] +y_test = test['label'] + +print("🔍 Training models...") +lr = LogisticRegression(C=10, max_iter=100, class_weight=None, random_state=42) +lr.fit(X_train, y_train) + +nb = MultinomialNB(alpha=0.1, fit_prior=True) +nb.fit(X_train, y_train) + +y_test_pred_lr = lr.predict(X_test) +print("\n📊 Logistic Regression Test Performance:") +print(classification_report(y_test, y_test_pred_lr, target_names=['Reliable', 'Fake'])) + +y_test_pred_nb = nb.predict(X_test) +print("\n📊 Naïve Bayes Test Performance:") +print(classification_report(y_test, y_test_pred_nb, target_names=['Reliable', 'Fake'])) + +# Function to perform hyperparameter tuning, not used in the final script +def hyperparameter_tuning(): + print("🔍 Hyperparameter tuning...") + param_grid_lr = { + 'C': [0.1, 1, 10], + 'max_iter': [100, 500, 1000], + 'class_weight': ['balanced', None] + } + grid = GridSearchCV(LogisticRegression(), param_grid_lr, cv=3) + grid.fit(X_train, y_train) + print("✅ Best Logistic Regression Parameters:", grid.best_params_) + + param_grid_nb = { + 'alpha': [0.1, 0.5, 1.0, 2.0], + 'fit_prior': [True, False] + } + grid_nb = GridSearchCV(MultinomialNB(), param_grid_nb, cv=3) + grid_nb.fit(X_val, y_val) + print("✅ Best Naïve Bayes Parameters:", grid_nb.best_params_)