Add data processing and sampling for fake news dataset

This commit is contained in:
2025-03-26 17:49:39 +02:00
parent 97466edeae
commit 1dc796b59e
7 changed files with 737 additions and 187 deletions


@@ -1,182 +1,75 @@
import numpy as np
import random
import pandas as pd
import spacy
import nltk
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from collections import Counter
from pandarallel import pandarallel
import multiprocessing
import os
import subprocess
import pyarrow as pa
import pyarrow.parquet as pq
# Download NLTK stopwords
nltk.download('stopwords')
# Paths
csv_path = "sampled_news.csv"
parquet_path = "sampled_news_sm.parquet"
sample_path = "../data/sampled_fakenews"
output_parquet = "processed_fakenews.parquet"
output_csv = "processed_fakenews.csv"

# Sampling configuration
SAMPLE_FRACTION = 0.1
RANDOM_SEED = 42  # For reproducibility
def get_sample_size(total_rows, log=False):
    sample_size = int(total_rows * SAMPLE_FRACTION)
    if log:
        print(f"📉 Reducing dataset from {total_rows:,} to {sample_size:,} rows...")
    return sample_size


def sample_dataframe(df, total_rows):
    sample_size = get_sample_size(total_rows=total_rows, log=True)
    return df.sample(n=sample_size, random_state=RANDOM_SEED)
# Try to load from Parquet first, fall back to CSV if not available
if os.path.exists(parquet_path):
    data_path = parquet_path
    print(f"🔍 Loading data from Parquet file at '{parquet_path}'")
    try:
        # Read metadata to get the row count without loading the entire file
        parquet_file = pq.ParquetFile(parquet_path)
        total_rows = parquet_file.metadata.num_rows
        print(f"🔍 Dataset contains {total_rows:,} rows.")
        # Read and sample the data
        df_sample = sample_dataframe(pd.read_parquet(parquet_path), total_rows)
    except Exception as e:
        print(f"❌ Error reading Parquet file: {e}")
        print("🔄 Falling back to CSV...")
        if not os.path.exists(csv_path):
            print(f"❌ Error: Neither Parquet nor CSV file found at {parquet_path} or {csv_path}")
            exit()
        # Get total rows from CSV (Unix-like systems only due to `wc`)
        total_rows = int(subprocess.check_output(["wc", "-l", csv_path]).split()[0]) - 1
        print(f"🔍 Dataset contains {total_rows:,} rows.")
        # Read and sample the data
        df_sample = sample_dataframe(
            pd.read_csv(csv_path, lineterminator="\n", on_bad_lines="skip"),
            total_rows
        )
elif os.path.exists(csv_path):
    # Convert CSV to Parquet if needed
    print(f"🔍 Parquet file not found, loading from CSV at {csv_path}")
    print("🔄 Converting CSV to Parquet...")
    df = pd.read_csv(csv_path, lineterminator="\n", on_bad_lines="skip", usecols=["id", "content", "type"])
    df.to_parquet(parquet_path, index=False)
    print("✅ Conversion complete.")
    data_path = parquet_path
    # Get total rows from CSV (Unix-like systems only due to `wc`)
    total_rows = int(subprocess.check_output(["wc", "-l", csv_path]).split()[0]) - 1
    print(f"🔍 Dataset contains {total_rows:,} rows.")
    # Read and sample the data
    df_sample = sample_dataframe(
        pd.read_csv(csv_path, lineterminator="\n", on_bad_lines="skip"),
        total_rows
    )
else:
    print(f"❌ Error: Neither Parquet nor CSV file found at {parquet_path} or {csv_path}")
    exit()
# Load spaCy model
print("📚 Loading spaCy model...")
try:
    nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])
except OSError:
    print("⬇️ Model not found. Downloading...")
    subprocess.run(["python", "-m", "spacy", "download", "en_core_web_sm"])
    nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])
print("📖 spaCy model loaded.")
# Verify the sample size
print(f"✅ Sample contains {len(df_sample):,} rows (expected {get_sample_size(total_rows=total_rows):,} rows)")
# Stopwords & Stemmer
stop_words = set(stopwords.words("english"))
stemmer = PorterStemmer()
# Save the sample in both formats
df_sample.to_csv(f"{sample_path}.csv", index=False)
df_sample.to_parquet(f"{sample_path}.parquet", index=False)
# Initialize parallel processing
pandarallel.initialize(nb_workers=max(1, int(multiprocessing.cpu_count() / 2)), progress_bar=True)
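# Process the input Parquet file in batches to keep memory usage bounded;
# the counters below track vocabulary and token statistics at each preprocessing stage.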
batch_size = 100000
parquet_file = pq.ParquetFile(data_path)
processed_chunks = []
vocab_before = Counter()
vocab_after_stopwords = Counter()
vocab_after_stemming = Counter()
total_words_before = 0
total_words_after_stopwords = 0
total_words_after_stemming = 0
total_chars_after_stopwords = 0
total_chars_after_stemming = 0
print("🧮 Processing text in batches...")
batch_num = 0
for batch in parquet_file.iter_batches(batch_size):
print(f"🔢 Processing batch {batch_num + 1}...")
chunk = batch.to_pandas()
chunk = chunk.dropna(subset=["content"]).astype({'content': 'string'})
print("🪙 Tokenizing text...")
chunk_tokens = chunk["content"].parallel_apply(lambda text: [word.lower() for word in text.split() if word.isalpha()])
for tokens in chunk_tokens:
vocab_before.update(tokens)
total_words_before += len(tokens)
print("🚫 Removing stopwords...")
chunk_no_stopwords = chunk_tokens.parallel_apply(lambda tokens: [word for word in tokens if word not in stop_words])
for tokens in chunk_no_stopwords:
vocab_after_stopwords.update(tokens)
total_words_after_stopwords += len(tokens)
total_chars_after_stopwords += sum(len(word) for word in tokens)
print("🌱 Applying stemming...")
chunk_stemmed = chunk_no_stopwords.parallel_apply(lambda tokens: [stemmer.stem(word) for word in tokens])
for tokens in chunk_stemmed:
vocab_after_stemming.update(tokens)
total_words_after_stemming += len(tokens)
total_chars_after_stemming += sum(len(word) for word in tokens)
print("📝 Joining tokens back to text...")
chunk["processed_text"] = chunk_stemmed.parallel_apply(lambda tokens: ' '.join(tokens))
processed_chunks.append(chunk[["id", "processed_text", "type"]])
batch_num += 1
# Save processed data
final_df = pd.concat(processed_chunks, ignore_index=True)
final_df.to_parquet(output_parquet, index=False)
final_df.to_csv(output_csv, index=False)
print(f"💾 Processed data saved to '{output_parquet}' and '{output_csv}'")
total_vocab_before = len(vocab_before)
total_vocab_after_stopwords = len(vocab_after_stopwords)
total_vocab_after_stemming = len(vocab_after_stemming)
total_stopword_reduction = (total_words_before - total_words_after_stopwords) / total_words_before * 100
print(f"📊 Total words (the raw number of all words in the text, including duplicates): {total_words_before:,}")
print(f"⏮️ Before stopword removal: {total_words_before:,}")
print(f"🔻 After stopword removal: {total_words_after_stopwords:,} (-{total_stopword_reduction:.2f}%)")
vocab_stemming_reduction = (total_vocab_after_stopwords - total_vocab_after_stemming) / total_vocab_after_stopwords * 100
print(f"🫆 Vocabulary (the number of distinct words in the text, ignoring duplicates):")
print(f"⏮️ Before stopword removal: {total_vocab_before:,}")
print(f"⏮️ Before stemming: {total_vocab_after_stopwords:,}")
print(f"🔻 After stemming: {total_vocab_after_stemming:,} (-{vocab_stemming_reduction:.2f}%)")
avg_chars_after_stopwords = total_chars_after_stopwords / total_words_after_stopwords
avg_chars_after_stemming = total_chars_after_stemming / total_words_after_stemming
avg_chars_reduction = (avg_chars_after_stopwords - avg_chars_after_stemming) / avg_chars_after_stopwords * 100
print(f"📏 Avg. length of retained words:")
print(f"⏮️ After stopword removal: {avg_chars_after_stopwords:.2f}")
print(f"🔻 After stemming: {avg_chars_after_stemming:.2f} (-{avg_chars_reduction:.2f}%)")
# Get most frequent words before and after stopword removal & stemming
def get_most_frequent_words(vocab, top_n=10):
    return vocab.most_common(top_n)
top_words_before = get_most_frequent_words(vocab_before)
top_words_after_stopwords = get_most_frequent_words(vocab_after_stopwords)
top_words_after_stemming = get_most_frequent_words(vocab_after_stemming)
print("📌 Top 10 words before preprocessing:", top_words_before)
print("📌 Top 10 words after stopword removal:", top_words_after_stopwords)
print("📌 Top 10 words after stemming:", top_words_after_stemming)
def plot_word_frequencies(vocab_before, vocab_after_stopwords, vocab_after_stemming, top_n=10000):
    plt.figure(figsize=(12, 7))
    freq_before = [freq for _, freq in vocab_before.most_common(top_n)]
    freq_after_stopwords = [freq for _, freq in vocab_after_stopwords.most_common(top_n)]
    freq_after_stemming = [freq for _, freq in vocab_after_stemming.most_common(top_n)]
    plt.loglog(range(1, len(freq_before) + 1), freq_before,
               label='Raw Text', color='royalblue', alpha=0.8, linewidth=2)
    plt.loglog(range(1, len(freq_after_stopwords) + 1), freq_after_stopwords,
               label='After Stopword Removal', color='orange', alpha=0.8, linewidth=2)
    plt.loglog(range(1, len(freq_after_stemming) + 1), freq_after_stemming,
               label='After Stemming', color='green', alpha=0.8, linewidth=2)
    # Add Zipf's law reference line
    zipf_x = np.array(range(1, top_n + 1))
    zipf_y = freq_before[0] / zipf_x
    plt.plot(zipf_x, zipf_y, 'r--', label="Zipf's Law", alpha=0.5)
    top_words = [word for word, _ in vocab_before.most_common(5)]
    for rank, word in enumerate(top_words, 1):
        freq = vocab_before[word]
        plt.annotate(word, xy=(rank, freq), xytext=(rank * 1.5, freq * 1.5),
                     arrowprops=dict(facecolor='black', shrink=0.05, width=1, headwidth=4),
                     fontsize=9, bbox=dict(boxstyle="round,pad=0.3", fc="white", ec="gray", lw=1))
    plt.title('Word Frequency Distribution (Log-Log Scale)', fontsize=14, pad=20)
    plt.xlabel('Word Rank (Log Scale)', fontsize=12)
    plt.ylabel('Frequency (Log Scale)', fontsize=12)
    plt.grid(True, which="both", ls="-", alpha=0.2)
    plt.legend(fontsize=11)
    plt.text(0.02, 0.02,
             "• Steep drop at left = Stopwords dominate\n"
             "• Flatter curve after processing = Better balance\n"
             "• Close to Zipf's line = Natural language pattern",
             transform=plt.gca().transAxes, fontsize=10,
             bbox=dict(boxstyle="round", fc="white", ec="gray", pad=0.4))
    plt.tight_layout()
    plt.show()
plot_word_frequencies(vocab_before, vocab_after_stopwords, vocab_after_stemming)
print(f"💾 Sample saved to '{sample_path}.csv' and '{sample_path}.parquet'.")