Add data processing and sampling for fake news dataset

2025-03-26 17:49:39 +02:00
parent 97466edeae
commit 1dc796b59e
7 changed files with 737 additions and 187 deletions

2
.gitignore vendored Normal file

@@ -0,0 +1,2 @@
pyenv
data

452
archives/fnc1a.log Normal file

@@ -0,0 +1,452 @@
[nltk_data] Downloading package stopwords to
[nltk_data] /home/andrewtrieu/nltk_data...
[nltk_data] Package stopwords is already up-to-date!
[2025-03-26 13:12:19] 📚 Loading spaCy model...
[2025-03-26 13:12:20] 📖 spaCy model loaded.
INFO: Pandarallel will run on 6 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.
[2025-03-26 13:12:20] 🧮 Processing text in batches...
[2025-03-26 13:12:21] 🔢 Processing batch 1...
[2025-03-26 13:12:22] 🪙 Tokenizing text...
[2025-03-26 13:12:41] 🚫 Removing stopwords...
[2025-03-26 13:13:09] 🌱 Applying stemming...
[2025-03-26 13:15:28] 📝 Joining tokens back to text...
[2025-03-26 13:15:38] 🔢 Processing batch 2...
[2025-03-26 13:15:39] 🪙 Tokenizing text...
[2025-03-26 13:16:00] 🚫 Removing stopwords...
[2025-03-26 13:16:23] 🌱 Applying stemming...
[2025-03-26 13:18:24] 📝 Joining tokens back to text...
[2025-03-26 13:18:33] 🔢 Processing batch 3...
[2025-03-26 13:18:34] 🪙 Tokenizing text...
[2025-03-26 13:18:55] 🚫 Removing stopwords...
[2025-03-26 13:19:21] 🌱 Applying stemming...
[2025-03-26 13:21:19] 📝 Joining tokens back to text...
[2025-03-26 13:21:27] 🔢 Processing batch 4...
[2025-03-26 13:21:28] 🪙 Tokenizing text...
[2025-03-26 13:21:45] 🚫 Removing stopwords...
[2025-03-26 13:22:08] 🌱 Applying stemming...
[2025-03-26 13:23:58] 📝 Joining tokens back to text...
[2025-03-26 13:24:06] 🔢 Processing batch 5...
[2025-03-26 13:24:07] 🪙 Tokenizing text...
[2025-03-26 13:24:28] 🚫 Removing stopwords...
[2025-03-26 13:24:54] 🌱 Applying stemming...
[2025-03-26 13:26:58] 📝 Joining tokens back to text...
[2025-03-26 13:27:09] 🔢 Processing batch 6...
[2025-03-26 13:27:10] 🪙 Tokenizing text...
[2025-03-26 13:27:31] 🚫 Removing stopwords...
[2025-03-26 13:27:57] 🌱 Applying stemming...
[2025-03-26 13:29:57] 📝 Joining tokens back to text...
[2025-03-26 13:30:07] 🔢 Processing batch 7...
[2025-03-26 13:30:07] 🪙 Tokenizing text...
[2025-03-26 13:30:28] 🚫 Removing stopwords...
[2025-03-26 13:30:55] 🌱 Applying stemming...
[2025-03-26 13:32:51] 📝 Joining tokens back to text...
[2025-03-26 13:33:00] 🔢 Processing batch 8...
[2025-03-26 13:33:00] 🪙 Tokenizing text...
[2025-03-26 13:33:23] 🚫 Removing stopwords...
[2025-03-26 13:33:53] 🌱 Applying stemming...
[2025-03-26 13:36:08] 📝 Joining tokens back to text...
[2025-03-26 13:36:18] 🔢 Processing batch 9...
[2025-03-26 13:36:18] 🪙 Tokenizing text...
[2025-03-26 13:36:40] 🚫 Removing stopwords...
[2025-03-26 13:36:59] 🌱 Applying stemming...
[2025-03-26 13:38:31] 📝 Joining tokens back to text...
[2025-03-26 13:38:38] 🔢 Processing batch 10...
[2025-03-26 13:38:38] 🪙 Tokenizing text...
[2025-03-26 13:38:51] 🚫 Removing stopwords...
[2025-03-26 13:39:06] 🌱 Applying stemming...
[2025-03-26 13:40:20] 📝 Joining tokens back to text...
[2025-03-26 13:40:26] 🔢 Processing batch 11...
[2025-03-26 13:40:26] 🪙 Tokenizing text...
[2025-03-26 13:40:42] 🚫 Removing stopwords...
[2025-03-26 13:41:02] 🌱 Applying stemming...
[2025-03-26 13:42:38] 📝 Joining tokens back to text...
[2025-03-26 13:42:46] 🔢 Processing batch 12...
[2025-03-26 13:42:46] 🪙 Tokenizing text...
[2025-03-26 13:43:05] 🚫 Removing stopwords...
[2025-03-26 13:43:30] 🌱 Applying stemming...
[2025-03-26 13:45:27] 📝 Joining tokens back to text...
[2025-03-26 13:45:36] 🔢 Processing batch 13...
[2025-03-26 13:45:36] 🪙 Tokenizing text...
[2025-03-26 13:45:50] 🚫 Removing stopwords...
[2025-03-26 13:46:07] 🌱 Applying stemming...
[2025-03-26 13:47:20] 📝 Joining tokens back to text...
[2025-03-26 13:47:26] 🔢 Processing batch 14...
[2025-03-26 13:47:27] 🪙 Tokenizing text...
[2025-03-26 13:47:40] 🚫 Removing stopwords...
[2025-03-26 13:47:58] 🌱 Applying stemming...
[2025-03-26 13:49:14] 📝 Joining tokens back to text...
[2025-03-26 13:49:22] 🔢 Processing batch 15...
[2025-03-26 13:49:22] 🪙 Tokenizing text...
[2025-03-26 13:49:43] 🚫 Removing stopwords...
[2025-03-26 13:50:06] 🌱 Applying stemming...
[2025-03-26 13:51:52] 📝 Joining tokens back to text...
[2025-03-26 13:52:00] 🔢 Processing batch 16...
[2025-03-26 13:52:01] 🪙 Tokenizing text...
[2025-03-26 13:52:18] 🚫 Removing stopwords...
[2025-03-26 13:52:39] 🌱 Applying stemming...
[2025-03-26 13:54:14] 📝 Joining tokens back to text...
[2025-03-26 13:54:23] 🔢 Processing batch 17...
[2025-03-26 13:54:23] 🪙 Tokenizing text...
[2025-03-26 13:54:45] 🚫 Removing stopwords...
[2025-03-26 13:55:14] 🌱 Applying stemming...
[2025-03-26 13:57:21] 📝 Joining tokens back to text...
[2025-03-26 13:57:33] 🔢 Processing batch 18...
[2025-03-26 13:57:33] 🪙 Tokenizing text...
[2025-03-26 13:57:56] 🚫 Removing stopwords...
[2025-03-26 13:58:22] 🌱 Applying stemming...
[2025-03-26 14:00:39] 📝 Joining tokens back to text...
[2025-03-26 14:00:49] 🔢 Processing batch 19...
[2025-03-26 14:00:49] 🪙 Tokenizing text...
[2025-03-26 14:01:10] 🚫 Removing stopwords...
[2025-03-26 14:01:34] 🌱 Applying stemming...
[2025-03-26 14:03:27] 📝 Joining tokens back to text...
[2025-03-26 14:03:36] 🔢 Processing batch 20...
[2025-03-26 14:03:36] 🪙 Tokenizing text...
[2025-03-26 14:03:59] 🚫 Removing stopwords...
[2025-03-26 14:04:25] 🌱 Applying stemming...
[2025-03-26 14:06:22] 📝 Joining tokens back to text...
[2025-03-26 14:06:31] 🔢 Processing batch 21...
[2025-03-26 14:06:31] 🪙 Tokenizing text...
[2025-03-26 14:06:53] 🚫 Removing stopwords...
[2025-03-26 14:07:19] 🌱 Applying stemming...
[2025-03-26 14:09:41] 📝 Joining tokens back to text...
[2025-03-26 14:09:51] 🔢 Processing batch 22...
[2025-03-26 14:09:52] 🪙 Tokenizing text...
[2025-03-26 14:10:15] 🚫 Removing stopwords...
[2025-03-26 14:10:45] 🌱 Applying stemming...
[2025-03-26 14:13:15] 📝 Joining tokens back to text...
[2025-03-26 14:13:25] 🔢 Processing batch 23...
[2025-03-26 14:13:25] 🪙 Tokenizing text...
[2025-03-26 14:13:46] 🚫 Removing stopwords...
[2025-03-26 14:14:13] 🌱 Applying stemming...
[2025-03-26 14:16:16] 📝 Joining tokens back to text...
[2025-03-26 14:16:27] 🔢 Processing batch 24...
[2025-03-26 14:16:28] 🪙 Tokenizing text...
[2025-03-26 14:16:56] 🚫 Removing stopwords...
[2025-03-26 14:17:30] 🌱 Applying stemming...
[2025-03-26 14:20:45] 📝 Joining tokens back to text...
[2025-03-26 14:21:00] 🔢 Processing batch 25...
[2025-03-26 14:21:01] 🪙 Tokenizing text...
[2025-03-26 14:21:22] 🚫 Removing stopwords...
[2025-03-26 14:21:49] 🌱 Applying stemming...
[2025-03-26 14:23:36] 📝 Joining tokens back to text...
[2025-03-26 14:23:45] 🔢 Processing batch 26...
[2025-03-26 14:23:46] 🪙 Tokenizing text...
[2025-03-26 14:24:12] 🚫 Removing stopwords...
[2025-03-26 14:24:44] 🌱 Applying stemming...
[2025-03-26 14:26:49] 📝 Joining tokens back to text...
[2025-03-26 14:26:59] 🔢 Processing batch 27...
[2025-03-26 14:27:00] 🪙 Tokenizing text...
[2025-03-26 14:27:25] 🚫 Removing stopwords...
[2025-03-26 14:27:54] 🌱 Applying stemming...
[2025-03-26 14:29:55] 📝 Joining tokens back to text...
[2025-03-26 14:30:06] 🔢 Processing batch 28...
[2025-03-26 14:30:06] 🪙 Tokenizing text...
[2025-03-26 14:30:28] 🚫 Removing stopwords...
[2025-03-26 14:30:54] 🌱 Applying stemming...
[2025-03-26 14:32:39] 📝 Joining tokens back to text...
[2025-03-26 14:32:49] 🔢 Processing batch 29...
[2025-03-26 14:32:49] 🪙 Tokenizing text...
[2025-03-26 14:33:14] 🚫 Removing stopwords...
[2025-03-26 14:33:44] 🌱 Applying stemming...
[2025-03-26 14:36:18] 📝 Joining tokens back to text...
[2025-03-26 14:36:29] 🔢 Processing batch 30...
[2025-03-26 14:36:29] 🪙 Tokenizing text...
[2025-03-26 14:36:52] 🚫 Removing stopwords...
[2025-03-26 14:37:21] 🌱 Applying stemming...
[2025-03-26 14:39:37] 📝 Joining tokens back to text...
[2025-03-26 14:39:47] 🔢 Processing batch 31...
[2025-03-26 14:39:48] 🪙 Tokenizing text...
[2025-03-26 14:40:09] 🚫 Removing stopwords...
[2025-03-26 14:40:37] 🌱 Applying stemming...
[2025-03-26 14:42:33] 📝 Joining tokens back to text...
[2025-03-26 14:42:43] 🔢 Processing batch 32...
[2025-03-26 14:42:44] 🪙 Tokenizing text...
[2025-03-26 14:43:08] 🚫 Removing stopwords...
[2025-03-26 14:43:33] 🌱 Applying stemming...
[2025-03-26 14:45:06] 📝 Joining tokens back to text...
[2025-03-26 14:45:15] 🔢 Processing batch 33...
[2025-03-26 14:45:15] 🪙 Tokenizing text...
[2025-03-26 14:45:36] 🚫 Removing stopwords...
[2025-03-26 14:45:58] 🌱 Applying stemming...
[2025-03-26 14:47:41] 📝 Joining tokens back to text...
[2025-03-26 14:47:50] 🔢 Processing batch 34...
[2025-03-26 14:47:51] 🪙 Tokenizing text...
[2025-03-26 14:48:12] 🚫 Removing stopwords...
[2025-03-26 14:48:39] 🌱 Applying stemming...
[2025-03-26 14:50:40] 📝 Joining tokens back to text...
[2025-03-26 14:50:50] 🔢 Processing batch 35...
[2025-03-26 14:50:50] 🪙 Tokenizing text...
[2025-03-26 14:51:12] 🚫 Removing stopwords...
[2025-03-26 14:51:41] 🌱 Applying stemming...
[2025-03-26 14:53:41] 📝 Joining tokens back to text...
[2025-03-26 14:53:51] 🔢 Processing batch 36...
[2025-03-26 14:53:52] 🪙 Tokenizing text...
[2025-03-26 14:54:10] 🚫 Removing stopwords...
[2025-03-26 14:54:33] 🌱 Applying stemming...
[2025-03-26 14:56:06] 📝 Joining tokens back to text...
[2025-03-26 14:56:15] 🔢 Processing batch 37...
[2025-03-26 14:56:16] 🪙 Tokenizing text...
[2025-03-26 14:56:36] 🚫 Removing stopwords...
[2025-03-26 14:57:03] 🌱 Applying stemming...
[2025-03-26 14:58:44] 📝 Joining tokens back to text...
[2025-03-26 14:58:54] 🔢 Processing batch 38...
[2025-03-26 14:58:55] 🪙 Tokenizing text...
[2025-03-26 14:59:29] 🚫 Removing stopwords...
[2025-03-26 14:59:59] 🌱 Applying stemming...
[2025-03-26 15:02:26] 📝 Joining tokens back to text...
[2025-03-26 15:02:39] 🔢 Processing batch 39...
[2025-03-26 15:02:40] 🪙 Tokenizing text...
[2025-03-26 15:03:07] 🚫 Removing stopwords...
[2025-03-26 15:03:40] 🌱 Applying stemming...
[2025-03-26 15:06:16] 📝 Joining tokens back to text...
[2025-03-26 15:06:27] 🔢 Processing batch 40...
[2025-03-26 15:06:28] 🪙 Tokenizing text...
[2025-03-26 15:06:49] 🚫 Removing stopwords...
[2025-03-26 15:07:14] 🌱 Applying stemming...
[2025-03-26 15:09:02] 📝 Joining tokens back to text...
[2025-03-26 15:09:12] 🔢 Processing batch 41...
[2025-03-26 15:09:13] 🪙 Tokenizing text...
[2025-03-26 15:09:37] 🚫 Removing stopwords...
[2025-03-26 15:10:05] 🌱 Applying stemming...
[2025-03-26 15:12:15] 📝 Joining tokens back to text...
[2025-03-26 15:12:26] 🔢 Processing batch 42...
[2025-03-26 15:12:27] 🪙 Tokenizing text...
[2025-03-26 15:12:50] 🚫 Removing stopwords...
[2025-03-26 15:13:20] 🌱 Applying stemming...
[2025-03-26 15:15:29] 📝 Joining tokens back to text...
[2025-03-26 15:15:40] 🔢 Processing batch 43...
[2025-03-26 15:15:41] 🪙 Tokenizing text...
[2025-03-26 15:16:08] 🚫 Removing stopwords...
[2025-03-26 15:16:39] 🌱 Applying stemming...
[2025-03-26 15:18:47] 📝 Joining tokens back to text...
[2025-03-26 15:19:01] 🔢 Processing batch 44...
[2025-03-26 15:19:02] 🪙 Tokenizing text...
[2025-03-26 15:19:28] 🚫 Removing stopwords...
[2025-03-26 15:19:57] 🌱 Applying stemming...
[2025-03-26 15:21:44] 📝 Joining tokens back to text...
[2025-03-26 15:21:54] 🔢 Processing batch 45...
[2025-03-26 15:21:54] 🪙 Tokenizing text...
[2025-03-26 15:22:14] 🚫 Removing stopwords...
[2025-03-26 15:22:36] 🌱 Applying stemming...
[2025-03-26 15:24:15] 📝 Joining tokens back to text...
[2025-03-26 15:24:28] 🔢 Processing batch 46...
[2025-03-26 15:24:28] 🪙 Tokenizing text...
[2025-03-26 15:24:54] 🚫 Removing stopwords...
[2025-03-26 15:25:24] 🌱 Applying stemming...
[2025-03-26 15:27:19] 📝 Joining tokens back to text...
[2025-03-26 15:27:29] 🔢 Processing batch 47...
[2025-03-26 15:27:29] 🪙 Tokenizing text...
[2025-03-26 15:27:51] 🚫 Removing stopwords...
[2025-03-26 15:28:16] 🌱 Applying stemming...
[2025-03-26 15:29:54] 📝 Joining tokens back to text...
[2025-03-26 15:30:06] 🔢 Processing batch 48...
[2025-03-26 15:30:06] 🪙 Tokenizing text...
[2025-03-26 15:30:28] 🚫 Removing stopwords...
[2025-03-26 15:30:56] 🌱 Applying stemming...
[2025-03-26 15:32:48] 📝 Joining tokens back to text...
[2025-03-26 15:32:59] 🔢 Processing batch 49...
[2025-03-26 15:33:00] 🪙 Tokenizing text...
[2025-03-26 15:33:25] 🚫 Removing stopwords...
[2025-03-26 15:33:52] 🌱 Applying stemming...
[2025-03-26 15:35:40] 📝 Joining tokens back to text...
[2025-03-26 15:35:49] 🔢 Processing batch 50...
[2025-03-26 15:35:49] 🪙 Tokenizing text...
[2025-03-26 15:36:09] 🚫 Removing stopwords...
[2025-03-26 15:36:33] 🌱 Applying stemming...
[2025-03-26 15:38:12] 📝 Joining tokens back to text...
[2025-03-26 15:38:22] 🔢 Processing batch 51...
[2025-03-26 15:38:22] 🪙 Tokenizing text...
[2025-03-26 15:38:42] 🚫 Removing stopwords...
[2025-03-26 15:39:09] 🌱 Applying stemming...
[2025-03-26 15:40:52] 📝 Joining tokens back to text...
[2025-03-26 15:41:02] 🔢 Processing batch 52...
[2025-03-26 15:41:03] 🪙 Tokenizing text...
[2025-03-26 15:41:23] 🚫 Removing stopwords...
[2025-03-26 15:41:49] 🌱 Applying stemming...
[2025-03-26 15:43:36] 📝 Joining tokens back to text...
[2025-03-26 15:43:46] 🔢 Processing batch 53...
[2025-03-26 15:43:47] 🪙 Tokenizing text...
[2025-03-26 15:44:08] 🚫 Removing stopwords...
[2025-03-26 15:44:37] 🌱 Applying stemming...
[2025-03-26 15:46:41] 📝 Joining tokens back to text...
[2025-03-26 15:46:52] 🔢 Processing batch 54...
[2025-03-26 15:46:52] 🪙 Tokenizing text...
[2025-03-26 15:47:14] 🚫 Removing stopwords...
[2025-03-26 15:47:41] 🌱 Applying stemming...
[2025-03-26 15:49:43] 📝 Joining tokens back to text...
[2025-03-26 15:49:54] 🔢 Processing batch 55...
[2025-03-26 15:49:54] 🪙 Tokenizing text...
[2025-03-26 15:50:18] 🚫 Removing stopwords...
[2025-03-26 15:50:48] 🌱 Applying stemming...
[2025-03-26 15:52:41] 📝 Joining tokens back to text...
[2025-03-26 15:52:51] 🔢 Processing batch 56...
[2025-03-26 15:52:51] 🪙 Tokenizing text...
[2025-03-26 15:53:14] 🚫 Removing stopwords...
[2025-03-26 15:53:40] 🌱 Applying stemming...
[2025-03-26 15:55:33] 📝 Joining tokens back to text...
[2025-03-26 15:55:44] 🔢 Processing batch 57...
[2025-03-26 15:55:44] 🪙 Tokenizing text...
[2025-03-26 15:56:07] 🚫 Removing stopwords...
[2025-03-26 15:56:36] 🌱 Applying stemming...
[2025-03-26 15:58:33] 📝 Joining tokens back to text...
[2025-03-26 15:58:44] 🔢 Processing batch 58...
[2025-03-26 15:58:45] 🪙 Tokenizing text...
[2025-03-26 15:59:08] 🚫 Removing stopwords...
[2025-03-26 15:59:38] 🌱 Applying stemming...
[2025-03-26 16:01:33] 📝 Joining tokens back to text...
[2025-03-26 16:01:43] 🔢 Processing batch 59...
[2025-03-26 16:01:43] 🪙 Tokenizing text...
[2025-03-26 16:02:07] 🚫 Removing stopwords...
[2025-03-26 16:02:35] 🌱 Applying stemming...
[2025-03-26 16:04:53] 📝 Joining tokens back to text...
[2025-03-26 16:05:05] 🔢 Processing batch 60...
[2025-03-26 16:05:05] 🪙 Tokenizing text...
[2025-03-26 16:05:23] 🚫 Removing stopwords...
[2025-03-26 16:05:47] 🌱 Applying stemming...
[2025-03-26 16:07:20] 📝 Joining tokens back to text...
[2025-03-26 16:07:29] 🔢 Processing batch 61...
[2025-03-26 16:07:29] 🪙 Tokenizing text...
[2025-03-26 16:07:52] 🚫 Removing stopwords...
[2025-03-26 16:08:12] 🌱 Applying stemming...
[2025-03-26 16:09:52] 📝 Joining tokens back to text...
[2025-03-26 16:10:01] 🔢 Processing batch 62...
[2025-03-26 16:10:01] 🪙 Tokenizing text...
[2025-03-26 16:10:19] 🚫 Removing stopwords...
[2025-03-26 16:10:40] 🌱 Applying stemming...
[2025-03-26 16:12:10] 📝 Joining tokens back to text...
[2025-03-26 16:12:18] 🔢 Processing batch 63...
[2025-03-26 16:12:19] 🪙 Tokenizing text...
[2025-03-26 16:12:35] 🚫 Removing stopwords...
[2025-03-26 16:12:56] 🌱 Applying stemming...
[2025-03-26 16:14:25] 📝 Joining tokens back to text...
[2025-03-26 16:14:35] 🔢 Processing batch 64...
[2025-03-26 16:14:36] 🪙 Tokenizing text...
[2025-03-26 16:15:00] 🚫 Removing stopwords...
[2025-03-26 16:15:29] 🌱 Applying stemming...
[2025-03-26 16:17:47] 📝 Joining tokens back to text...
[2025-03-26 16:17:58] 🔢 Processing batch 65...
[2025-03-26 16:17:58] 🪙 Tokenizing text...
[2025-03-26 16:18:20] 🚫 Removing stopwords...
[2025-03-26 16:18:49] 🌱 Applying stemming...
[2025-03-26 16:20:46] 📝 Joining tokens back to text...
[2025-03-26 16:20:56] 🔢 Processing batch 66...
[2025-03-26 16:20:57] 🪙 Tokenizing text...
[2025-03-26 16:21:20] 🚫 Removing stopwords...
[2025-03-26 16:21:52] 🌱 Applying stemming...
[2025-03-26 16:23:53] 📝 Joining tokens back to text...
[2025-03-26 16:24:05] 🔢 Processing batch 67...
[2025-03-26 16:24:05] 🪙 Tokenizing text...
[2025-03-26 16:24:29] 🚫 Removing stopwords...
[2025-03-26 16:24:51] 🌱 Applying stemming...
[2025-03-26 16:26:26] 📝 Joining tokens back to text...
[2025-03-26 16:26:35] 🔢 Processing batch 68...
[2025-03-26 16:26:35] 🪙 Tokenizing text...
[2025-03-26 16:26:56] 🚫 Removing stopwords...
[2025-03-26 16:27:22] 🌱 Applying stemming...
[2025-03-26 16:29:06] 📝 Joining tokens back to text...
[2025-03-26 16:29:16] 🔢 Processing batch 69...
[2025-03-26 16:29:17] 🪙 Tokenizing text...
[2025-03-26 16:29:35] 🚫 Removing stopwords...
[2025-03-26 16:30:00] 🌱 Applying stemming...
[2025-03-26 16:31:41] 📝 Joining tokens back to text...
[2025-03-26 16:31:51] 🔢 Processing batch 70...
[2025-03-26 16:31:51] 🪙 Tokenizing text...
[2025-03-26 16:32:11] 🚫 Removing stopwords...
[2025-03-26 16:32:35] 🌱 Applying stemming...
[2025-03-26 16:34:13] 📝 Joining tokens back to text...
[2025-03-26 16:34:23] 🔢 Processing batch 71...
[2025-03-26 16:34:23] 🪙 Tokenizing text...
[2025-03-26 16:34:45] 🚫 Removing stopwords...
[2025-03-26 16:35:13] 🌱 Applying stemming...
[2025-03-26 16:36:58] 📝 Joining tokens back to text...
[2025-03-26 16:37:08] 🔢 Processing batch 72...
[2025-03-26 16:37:08] 🪙 Tokenizing text...
[2025-03-26 16:37:33] 🚫 Removing stopwords...
[2025-03-26 16:37:59] 🌱 Applying stemming...
[2025-03-26 16:39:35] 📝 Joining tokens back to text...
[2025-03-26 16:39:45] 🔢 Processing batch 73...
[2025-03-26 16:39:45] 🪙 Tokenizing text...
[2025-03-26 16:40:06] 🚫 Removing stopwords...
[2025-03-26 16:40:32] 🌱 Applying stemming...
[2025-03-26 16:42:13] 📝 Joining tokens back to text...
[2025-03-26 16:42:24] 🔢 Processing batch 74...
[2025-03-26 16:42:25] 🪙 Tokenizing text...
[2025-03-26 16:42:46] 🚫 Removing stopwords...
[2025-03-26 16:43:12] 🌱 Applying stemming...
[2025-03-26 16:44:51] 📝 Joining tokens back to text...
[2025-03-26 16:45:01] 🔢 Processing batch 75...
[2025-03-26 16:45:01] 🪙 Tokenizing text...
[2025-03-26 16:45:23] 🚫 Removing stopwords...
[2025-03-26 16:45:48] 🌱 Applying stemming...
[2025-03-26 16:47:28] 📝 Joining tokens back to text...
[2025-03-26 16:47:39] 🔢 Processing batch 76...
[2025-03-26 16:47:39] 🪙 Tokenizing text...
[2025-03-26 16:48:00] 🚫 Removing stopwords...
[2025-03-26 16:48:31] 🌱 Applying stemming...
[2025-03-26 16:50:15] 📝 Joining tokens back to text...
[2025-03-26 16:50:25] 🔢 Processing batch 77...
[2025-03-26 16:50:25] 🪙 Tokenizing text...
[2025-03-26 16:50:52] 🚫 Removing stopwords...
[2025-03-26 16:51:18] 🌱 Applying stemming...
[2025-03-26 16:53:01] 📝 Joining tokens back to text...
[2025-03-26 16:53:11] 🔢 Processing batch 78...
[2025-03-26 16:53:11] 🪙 Tokenizing text...
[2025-03-26 16:53:33] 🚫 Removing stopwords...
[2025-03-26 16:54:00] 🌱 Applying stemming...
[2025-03-26 16:55:40] 📝 Joining tokens back to text...
[2025-03-26 16:55:50] 🔢 Processing batch 79...
[2025-03-26 16:55:51] 🪙 Tokenizing text...
[2025-03-26 16:56:11] 🚫 Removing stopwords...
[2025-03-26 16:56:38] 🌱 Applying stemming...
[2025-03-26 16:58:24] 📝 Joining tokens back to text...
[2025-03-26 16:58:34] 🔢 Processing batch 80...
[2025-03-26 16:58:35] 🪙 Tokenizing text...
[2025-03-26 16:58:55] 🚫 Removing stopwords...
[2025-03-26 16:59:22] 🌱 Applying stemming...
[2025-03-26 17:01:05] 📝 Joining tokens back to text...
[2025-03-26 17:01:15] 🔢 Processing batch 81...
[2025-03-26 17:01:16] 🪙 Tokenizing text...
[2025-03-26 17:01:37] 🚫 Removing stopwords...
[2025-03-26 17:02:05] 🌱 Applying stemming...
[2025-03-26 17:03:55] 📝 Joining tokens back to text...
[2025-03-26 17:04:06] 🔢 Processing batch 82...
[2025-03-26 17:04:07] 🪙 Tokenizing text...
[2025-03-26 17:04:34] 🚫 Removing stopwords...
[2025-03-26 17:05:08] 🌱 Applying stemming...
[2025-03-26 17:07:03] 📝 Joining tokens back to text...
[2025-03-26 17:07:14] 🔢 Processing batch 83...
[2025-03-26 17:07:15] 🪙 Tokenizing text...
[2025-03-26 17:07:40] 🚫 Removing stopwords...
[2025-03-26 17:08:08] 🌱 Applying stemming...
[2025-03-26 17:10:06] 📝 Joining tokens back to text...
[2025-03-26 17:10:18] 🔢 Processing batch 84...
[2025-03-26 17:10:19] 🪙 Tokenizing text...
[2025-03-26 17:10:46] 🚫 Removing stopwords...
[2025-03-26 17:11:17] 🌱 Applying stemming...
[2025-03-26 17:13:35] 📝 Joining tokens back to text...
[2025-03-26 17:13:48] 🔢 Processing batch 85...
[2025-03-26 17:13:49] 🪙 Tokenizing text...
[2025-03-26 17:14:19] 🚫 Removing stopwords...
[2025-03-26 17:14:53] 🌱 Applying stemming...
[2025-03-26 17:17:16] 📝 Joining tokens back to text...
[2025-03-26 17:17:28] 🔢 Processing batch 86...
[2025-03-26 17:17:28] 🪙 Tokenizing text...
[2025-03-26 17:17:43] 🚫 Removing stopwords...
[2025-03-26 17:17:57] 🌱 Applying stemming...
[2025-03-26 17:18:44] 📝 Joining tokens back to text...
[2025-03-26 17:26:42] 💾 Processed data saved to '../data/processed_fakenews.parquet' and '../data/processed_fakenews.csv'
[2025-03-26 17:26:42] 📊 Total words (the raw number of all words in the text, including duplicates): 3,307,195,209
⏮️ Before stopword removal: 3,307,195,209
🔻 After stopword removal: 1,744,854,554 (-47.24%)
[2025-03-26 17:26:42] 🫆 Vocabulary (the number of distinct words in the text, ignoring duplicates):
⏮️ Before stemming: 2,767,790
🔻 After stemming: 2,415,853 (-12.71%)
[2025-03-26 17:26:42] 📏 Avg. length of retained words:
⏮️ After stopword removal: 6.34
🔻 After stemming: 5.41 (-14.74%)
[2025-03-26 17:27:57] 📌 Top 10 words:
🔝 Before preprocessing: [('the', 220550046), ('of', 106480206), ('to', 106216779), ('and', 96588260), ('a', 83333254), ('in', 73132963), ('that', 45117566), ('is', 42309500), ('for', 36971267), ('on', 29213684)]
🔝 After stopword removal: [('new', 8599601), ('one', 8315754), ('would', 8145653), ('said', 7354978), ('people', 5996078), ('also', 5692918), ('like', 5565201), ('even', 4472256), ('us', 4463611), ('could', 4114863)]
🔝 After stemming: [('new', 8600723), ('one', 8599171), ('would', 8145683), ('said', 7355057), ('like', 6797658), ('state', 6085922), ('peopl', 6060566), ('use', 5697023), ('also', 5692967), ('time', 5630219)]
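A quick sanity check, recomputing the summary percentages from the totals logged above (a minimal sketch; the small second-decimal differences arise because the script divides unrounded counts while this log prints rounded ones):

words_before, words_after_stop = 3_307_195_209, 1_744_854_554
print(f"{(1 - words_after_stop / words_before) * 100:.2f}%")  # 47.24%, matches the log
vocab_raw, vocab_stemmed = 2_767_790, 2_415_853
print(f"{(1 - vocab_stemmed / vocab_raw) * 100:.2f}%")  # 12.72%; the log's 12.71% is measured against the slightly smaller post-stopword vocabulary
avg_stop, avg_stem = 6.34, 5.41
print(f"{(1 - avg_stem / avg_stop) * 100:.2f}%")  # 14.67% from these rounded averages; the log's 14.74% uses full precision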

BIN
archives/fnc1a.png Normal file

Binary file not shown.

Size: 122 KiB

6
archives/fnc1b.log Normal file

@@ -0,0 +1,6 @@
nohup: ignoring input
🔍 Loading data from Parquet file at 'processed_fakenews.parquet'
🔍 Dataset contains 8,528,956 rows.
📉 Reducing dataset from 8,528,956 to 852,895 rows...
✅ Sample contains 852,895 rows (expected 852,895 rows)
💾 Sample saved to 'sampled_fakenews.csv' and 'sampled_fakenews.parquet'.
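The instant row count in this log comes from the Parquet footer metadata rather than a full scan; a minimal sketch of that lookup (path taken from the log above):

import pyarrow.parquet as pq

# num_rows is stored in the Parquet footer, so no row data needs to be read
pf = pq.ParquetFile("processed_fakenews.parquet")
print(f"🔍 Dataset contains {pf.metadata.num_rows:,} rows.")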


@@ -1,27 +1,205 @@
import random
import numpy as np
import pandas as pd
import spacy
import nltk
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from collections import Counter
from pandarallel import pandarallel
import multiprocessing
import os
import subprocess
import pyarrow.parquet as pq
import pyarrow as pa
from datetime import datetime
data_path = "./FNC/news_cleaned_2018_02_13.csv"
sample_path = "sampled_news"
SAMPLE_FRACTION = 0.001 # Use 0.001 for 0.1% of the dataset
# Print log messages with timestamp
def print_log(msg):
    print(f"[{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}] {msg}")

if not os.path.exists(data_path):
    print(f"❌ Error: File not found at {data_path}")
# Download NLTK stopwords
nltk.download('stopwords')
# Load spaCy model
print_log("📚 Loading spaCy model...")
try:
    nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])
except OSError:
    print_log("⬇️ Model not found. Downloading...")
    subprocess.run(["python", "-m", "spacy", "download", "en_core_web_sm"])
    nlp = spacy.load("en_core_web_sm")
print_log("📖 spaCy model loaded.")
# Paths
csv_path = "../data/news_cleaned_2018_02_13.csv"
parquet_path = "../data/news_cleaned_2018_02_13.parquet"
output_parquet = "../data/processed_fakenews.parquet"
output_csv = "../data/processed_fakenews.csv"
# Convert CSV to Parquet if needed
if os.path.exists(parquet_path):
    data_path = parquet_path
elif os.path.exists(csv_path):
    print_log("🔄 Converting CSV to Parquet...")
    chunksize = 100_000
    pqwriter = None
    for i, df in enumerate(pd.read_csv(csv_path, lineterminator="\n", on_bad_lines="skip", chunksize=chunksize, usecols=["id", "content", "type"])):
        table = pa.Table.from_pandas(df)
        # If it's the first chunk, create a new parquet writer
        if i == 0:
            pqwriter = pq.ParquetWriter(parquet_path, table.schema)
        pqwriter.write_table(table)
    if pqwriter:
        pqwriter.close()
    print_log("✅ Conversion complete.")
    data_path = parquet_path
else:
    print_log("❌ Error: No dataset found.")
    exit()
# Get total rows. Only works on Unix-like systems due to `wc` command
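# NOTE: assumes data_path still points at a plain-text CSV; `wc -l` on a Parquet file would not give a row count.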
total_rows = int(subprocess.check_output(["wc", "-l", data_path]).split()[0]) - 1
print(f"🔍 Dataset contains {total_rows:,} rows.")
# Stopwords & Stemmer
stop_words = set(stopwords.words("english"))
stemmer = PorterStemmer()
sample_size = int(total_rows * SAMPLE_FRACTION)
print(f"📉 Reducing dataset to {sample_size:,} rows...")
# Initialize parallel processing
# !WARNING: This will use all available CPU cores, might kill host machine
# Set progress_bar=True to see a progress bar
pandarallel.initialize(nb_workers=max(1, int(multiprocessing.cpu_count())), progress_bar=False)
# Read only a sample
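# Skipping (total_rows - sample_size) randomly chosen data rows leaves a uniform sample of sample_size rows; row 0 (the header) is never skipped since the range starts at 1.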
skip_rows = sorted(random.sample(range(1, total_rows + 1), total_rows - sample_size))
df_sample = pd.read_csv(data_path, skiprows=skip_rows, lineterminator="\n", on_bad_lines="skip")
df_sample.to_csv(f"{sample_path}.csv", index=False)
df_sample.to_parquet(f"{sample_path}.parquet", index=False)
batch_size = 100000
parquet_file = pq.ParquetFile(data_path)
print("✅ Sample saved to sampled_news.csv and sampled_news.parquet.")
processed_chunks = []
vocab_before = Counter()
vocab_after_stopwords = Counter()
vocab_after_stemming = Counter()
total_words_before = 0
total_words_after_stopwords = 0
total_words_after_stemming = 0
total_chars_after_stopwords = 0
total_chars_after_stemming = 0
# Process text in batches
print_log("🧮 Processing text in batches...")
batch_num = 0
for batch in parquet_file.iter_batches(batch_size):
    print_log(f"🔢 Processing batch {batch_num + 1}...")
    chunk = batch.to_pandas()
    chunk = chunk.dropna(subset=["content"]).astype({'content': 'string'})
    # Tokenize, remove stopwords, and apply stemming
    print_log("🪙 Tokenizing text...")
    chunk_tokens = chunk["content"].parallel_apply(lambda text: [word.lower() for word in text.split() if word.isalpha()])
    for tokens in chunk_tokens:
        vocab_before.update(tokens)
        total_words_before += len(tokens)
    print_log("🚫 Removing stopwords...")
    chunk_no_stopwords = chunk_tokens.parallel_apply(lambda tokens: [word for word in tokens if word not in stop_words])
    for tokens in chunk_no_stopwords:
        vocab_after_stopwords.update(tokens)
        total_words_after_stopwords += len(tokens)
        total_chars_after_stopwords += sum(len(word) for word in tokens)
    print_log("🌱 Applying stemming...")
    chunk_stemmed = chunk_no_stopwords.parallel_apply(lambda tokens: [stemmer.stem(word) for word in tokens])
    for tokens in chunk_stemmed:
        vocab_after_stemming.update(tokens)
        total_words_after_stemming += len(tokens)
        total_chars_after_stemming += sum(len(word) for word in tokens)
    # Join tokens back to text
    print_log("📝 Joining tokens back to text...")
    chunk["processed_text"] = chunk_stemmed.parallel_apply(lambda tokens: ' '.join(tokens))
    processed_chunks.append(chunk[["id", "processed_text", "type"]])
    batch_num += 1
# Save processed data
final_df = pd.concat(processed_chunks, ignore_index=True)
final_df.to_parquet(output_parquet, index=False)
final_df.to_csv(output_csv, index=False)
print_log(f"💾 Processed data saved to '{output_parquet}' and '{output_csv}'")
# Print statistics
total_vocab_before = len(vocab_before)
total_vocab_after_stopwords = len(vocab_after_stopwords)
total_vocab_after_stemming = len(vocab_after_stemming)
total_stopword_reduction = (total_words_before - total_words_after_stopwords) / total_words_before * 100
print_log(f"📊 Total words (the raw number of all words in the text, including duplicates): {total_words_before:,}")
print(f"⏮️ Before stopword removal: {total_words_before:,}")
print(f"🔻 After stopword removal: {total_words_after_stopwords:,} (-{total_stopword_reduction:.2f}%)")
vocab_stemming_reduction = (total_vocab_after_stopwords - total_vocab_after_stemming) / total_vocab_after_stopwords * 100
print_log(f"🫆 Vocabulary (the number of distinct words in the text, ignoring duplicates):")
print(f"⏮️ Before stemming: {total_vocab_before:,}")
print(f"🔻 After stemming: {total_vocab_after_stemming:,} (-{vocab_stemming_reduction:.2f}%)")
avg_chars_after_stopwords = total_chars_after_stopwords / total_words_after_stopwords
avg_chars_after_stemming = total_chars_after_stemming / total_words_after_stemming
avg_chars_reduction = (avg_chars_after_stopwords - avg_chars_after_stemming) / avg_chars_after_stopwords * 100
print_log(f"📏 Avg. length of retained words:")
print(f"⏮️ After stopword removal: {avg_chars_after_stopwords:.2f}")
print(f"🔻 After stemming: {avg_chars_after_stemming:.2f} (-{avg_chars_reduction:.2f}%)")
# Get most frequent words before and after stopword removal & stemming
def get_most_frequent_words(vocab, top_n=10):
    return vocab.most_common(top_n)
top_words_before = get_most_frequent_words(vocab_before)
top_words_after_stopwords = get_most_frequent_words(vocab_after_stopwords)
top_words_after_stemming = get_most_frequent_words(vocab_after_stemming)
print_log("📌 Top 10 words:")
print("🔝 Before preprocessing:", top_words_before)
print("🔝 After stopword removal:", top_words_after_stopwords)
print("🔝 After stemming:", top_words_after_stemming)
def plot_word_frequencies(vocab_before, vocab_after_stopwords, vocab_after_stemming, top_n=10000):
    plt.figure(figsize=(12, 7))
    freq_before = [freq for _, freq in vocab_before.most_common(top_n)]
    freq_after_stopwords = [freq for _, freq in vocab_after_stopwords.most_common(top_n)]
    freq_after_stemming = [freq for _, freq in vocab_after_stemming.most_common(top_n)]
    plt.loglog(range(1, len(freq_before)+1), freq_before,
               label='Raw Text', color='royalblue', alpha=0.8, linewidth=2)
    plt.loglog(range(1, len(freq_after_stopwords)+1), freq_after_stopwords,
               label='After Stopword Removal', color='orange', alpha=0.8, linewidth=2)
    plt.loglog(range(1, len(freq_after_stemming)+1), freq_after_stemming,
               label='After Stemming', color='green', alpha=0.8, linewidth=2)
    # Add Zipf's law reference line
    zipf_x = np.array(range(1, top_n+1))
    zipf_y = freq_before[0] / zipf_x
    plt.plot(zipf_x, zipf_y, 'r--', label="Zipf's Law", alpha=0.5)
    top_words = [word for word, _ in vocab_before.most_common(5)]
    for rank, word in enumerate(top_words, 1):
        freq = vocab_before[word]
        plt.annotate(word, xy=(rank, freq), xytext=(rank*1.5, freq*1.5),
                     arrowprops=dict(facecolor='black', shrink=0.05, width=1, headwidth=4),
                     fontsize=9, bbox=dict(boxstyle="round,pad=0.3", fc="white", ec="gray", lw=1))
    plt.title('Word Frequency Distribution (Log-Log Scale)', fontsize=14, pad=20)
    plt.xlabel('Word Rank (Log Scale)', fontsize=12)
    plt.ylabel('Frequency (Log Scale)', fontsize=12)
    plt.grid(True, which="both", ls="-", alpha=0.2)
    plt.legend(fontsize=11)
    plt.text(0.02, 0.02,
             "• Steep drop at left = Stopwords dominate\n"
             "• Flatter curve after processing = Better balance\n"
             "• Close to Zipf's line = Natural language pattern",
             transform=plt.gca().transAxes, fontsize=10,
             bbox=dict(boxstyle="round", fc="white", ec="gray", pad=0.4))
    plt.tight_layout()
    plt.show()

plot_word_frequencies(vocab_before, vocab_after_stopwords, vocab_after_stemming)
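Once the run completes, the processed output can be reloaded for a quick inspection; a minimal sketch, assuming only the column names the script writes:

import pandas as pd

# Reload the processed output and inspect a few rows plus the label distribution
df = pd.read_parquet("../data/processed_fakenews.parquet")
print(df[["id", "processed_text", "type"]].head())
print(df["type"].value_counts())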


@@ -1,182 +1,75 @@
import numpy as np
import random
import pandas as pd
import spacy
import nltk
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from collections import Counter
from pandarallel import pandarallel
import multiprocessing
import os
import subprocess
import pyarrow as pa
import pyarrow.parquet as pq
# Download NLTK stopwords
nltk.download('stopwords')
parquet_path = "../data/processed_fakenews.parquet"
csv_path = "../data/processed_fakenews.csv"
sample_path = "../data/sampled_fakenews"
SAMPLE_FRACTION = 0.1
RANDOM_SEED = 42 # For reproducibility
# Paths
csv_path = "sampled_news.csv"
parquet_path = "sampled_news_sm.parquet"
output_parquet = "processed_fakenews.parquet"
output_csv = "processed_fakenews.csv"
def get_sample_size(total_rows, log=False):
    sample_size = int(total_rows * SAMPLE_FRACTION)
    if log:
        print(f"📉 Reducing dataset from {total_rows:,} to {sample_size:,} rows...")
    return sample_size

# Convert CSV to Parquet if needed
def sample_dataframe(df, total_rows):
    sample_size = get_sample_size(total_rows=total_rows, log=True)
    return df.sample(n=sample_size, random_state=RANDOM_SEED)
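# With a fixed RANDOM_SEED, rerunning the script draws the same sample from an unchanged input file.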
# Try to load from Parquet first, fall back to CSV if not available
if os.path.exists(parquet_path):
    data_path = parquet_path
elif os.path.exists(csv_path):
    print("🔄 Converting CSV to Parquet...")
    df = pd.read_csv(csv_path, lineterminator="\n", on_bad_lines="skip", usecols=["id", "content", "type"])
    df.to_parquet(parquet_path, index=False)
    print("✅ Conversion complete.")
    data_path = parquet_path
else:
    print("❌ Error: No dataset found.")

print(f"🔍 Loading data from Parquet file at '{parquet_path}'")
try:
    # Read metadata to get row count without loading entire file
    parquet_file = pq.ParquetFile(parquet_path)
    total_rows = parquet_file.metadata.num_rows
    print(f"🔍 Dataset contains {total_rows:,} rows.")
    # Read and sample the data
    df_sample = sample_dataframe(pd.read_parquet(parquet_path), total_rows)
except Exception as e:
    print(f"❌ Error reading Parquet file: {e}")
    print("🔄 Falling back to CSV...")
    if not os.path.exists(csv_path):
        print(f"❌ Error: Neither Parquet nor CSV file found at {parquet_path} or {csv_path}")
        exit()
# Load spaCy model
print("📚 Loading spaCy model...")
try:
    nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])
except OSError:
    print("⬇️ Model not found. Downloading...")
    subprocess.run(["python", "-m", "spacy", "download", "en_core_web_sm"])
    nlp = spacy.load("en_core_web_sm")
print("📖 spaCy model loaded.")
# Get total rows from CSV (Unix-like systems only due to `wc`)
total_rows = int(subprocess.check_output(["wc", "-l", csv_path]).split()[0]) - 1
print(f"🔍 Dataset contains {total_rows:,} rows.")
# Stopwords & Stemmer
stop_words = set(stopwords.words("english"))
stemmer = PorterStemmer()
# Read and sample the data
df_sample = sample_dataframe(
    pd.read_csv(csv_path, lineterminator="\n", on_bad_lines="skip"),
    total_rows
)
# Initialize parallel processing
pandarallel.initialize(nb_workers=max(1, int(multiprocessing.cpu_count() / 2)), progress_bar=True)
elif os.path.exists(csv_path):
    print(f"🔍 Parquet file not found, loading from CSV at {csv_path}")
    # Get total rows from CSV (Unix-like systems only due to `wc`)
    total_rows = int(subprocess.check_output(["wc", "-l", csv_path]).split()[0]) - 1
    print(f"🔍 Dataset contains {total_rows:,} rows.")
    batch_size = 100000
    parquet_file = pq.ParquetFile(data_path)
    # Read and sample the data
    df_sample = sample_dataframe(
        pd.read_csv(csv_path, lineterminator="\n", on_bad_lines="skip"),
        total_rows
    )
else:
    print(f"❌ Error: Neither Parquet nor CSV file found at {parquet_path} or {csv_path}")
    exit()
processed_chunks = []
vocab_before = Counter()
vocab_after_stopwords = Counter()
vocab_after_stemming = Counter()
total_words_before = 0
total_words_after_stopwords = 0
total_words_after_stemming = 0
# Verify the sample size
print(f"✅ Sample contains {len(df_sample):,} rows (expected {get_sample_size(total_rows=total_rows):,} rows)")
total_chars_after_stopwords = 0
total_chars_after_stemming = 0
# Save the sample in both formats
df_sample.to_csv(f"{sample_path}.csv", index=False)
df_sample.to_parquet(f"{sample_path}.parquet", index=False)
print("🧮 Processing text in batches...")
batch_num = 0
for batch in parquet_file.iter_batches(batch_size):
    print(f"🔢 Processing batch {batch_num + 1}...")
    chunk = batch.to_pandas()
    chunk = chunk.dropna(subset=["content"]).astype({'content': 'string'})
    print("🪙 Tokenizing text...")
    chunk_tokens = chunk["content"].parallel_apply(lambda text: [word.lower() for word in text.split() if word.isalpha()])
    for tokens in chunk_tokens:
        vocab_before.update(tokens)
        total_words_before += len(tokens)
    print("🚫 Removing stopwords...")
    chunk_no_stopwords = chunk_tokens.parallel_apply(lambda tokens: [word for word in tokens if word not in stop_words])
    for tokens in chunk_no_stopwords:
        vocab_after_stopwords.update(tokens)
        total_words_after_stopwords += len(tokens)
        total_chars_after_stopwords += sum(len(word) for word in tokens)
    print("🌱 Applying stemming...")
    chunk_stemmed = chunk_no_stopwords.parallel_apply(lambda tokens: [stemmer.stem(word) for word in tokens])
    for tokens in chunk_stemmed:
        vocab_after_stemming.update(tokens)
        total_words_after_stemming += len(tokens)
        total_chars_after_stemming += sum(len(word) for word in tokens)
    print("📝 Joining tokens back to text...")
    chunk["processed_text"] = chunk_stemmed.parallel_apply(lambda tokens: ' '.join(tokens))
    processed_chunks.append(chunk[["id", "processed_text", "type"]])
    batch_num += 1
# Save processed data
final_df = pd.concat(processed_chunks, ignore_index=True)
final_df.to_parquet(output_parquet, index=False)
final_df.to_csv(output_csv, index=False)
print(f"💾 Processed data saved to '{output_parquet}' and '{output_csv}'")
total_vocab_before = len(vocab_before)
total_vocab_after_stopwords = len(vocab_after_stopwords)
total_vocab_after_stemming = len(vocab_after_stemming)
total_stopword_reduction = (total_words_before - total_words_after_stopwords) / total_words_before * 100
print(f"📊 Total words (the raw number of all words in the text, including duplicates): {total_words_before:,}")
print(f"⏮️ Before stopword removal: {total_words_before:,}")
print(f"🔻 After stopword removal: {total_words_after_stopwords:,} (-{total_stopword_reduction:.2f}%)")
vocab_stemming_reduction = (total_vocab_after_stopwords - total_vocab_after_stemming) / total_vocab_after_stopwords * 100
print(f"🫆 Vocabulary (the number of distinct words in the text, ignoring duplicates):")
print(f"⏮️ Before stemming: {total_vocab_before:,}")
print(f"🔻 After stemming: {total_vocab_after_stemming:,} (-{vocab_stemming_reduction:.2f}%)")
avg_chars_after_stopwords = total_chars_after_stopwords / total_words_after_stopwords
avg_chars_after_stemming = total_chars_after_stemming / total_words_after_stemming
avg_chars_reduction = (avg_chars_after_stopwords - avg_chars_after_stemming) / avg_chars_after_stopwords * 100
print(f"📏 Avg. length of retained words:")
print(f"⏮️ After stopword removal: {avg_chars_after_stopwords:.2f}")
print(f"🔻 After stemming: {avg_chars_after_stemming:.2f} (-{avg_chars_reduction:.2f}%)")
# Get most frequent words before and after stopword removal & stemming
def get_most_frequent_words(vocab, top_n=10):
    return vocab.most_common(top_n)
top_words_before = get_most_frequent_words(vocab_before)
top_words_after_stopwords = get_most_frequent_words(vocab_after_stopwords)
top_words_after_stemming = get_most_frequent_words(vocab_after_stemming)
print("📌 Top 10 words before preprocessing:", top_words_before)
print("📌 Top 10 words after stopword removal:", top_words_after_stopwords)
print("📌 Top 10 words after stemming:", top_words_after_stemming)
def plot_word_frequencies(vocab_before, vocab_after_stopwords, vocab_after_stemming, top_n=10000):
    plt.figure(figsize=(12, 7))
    freq_before = [freq for _, freq in vocab_before.most_common(top_n)]
    freq_after_stopwords = [freq for _, freq in vocab_after_stopwords.most_common(top_n)]
    freq_after_stemming = [freq for _, freq in vocab_after_stemming.most_common(top_n)]
    plt.loglog(range(1, len(freq_before)+1), freq_before,
               label='Raw Text', color='royalblue', alpha=0.8, linewidth=2)
    plt.loglog(range(1, len(freq_after_stopwords)+1), freq_after_stopwords,
               label='After Stopword Removal', color='orange', alpha=0.8, linewidth=2)
    plt.loglog(range(1, len(freq_after_stemming)+1), freq_after_stemming,
               label='After Stemming', color='green', alpha=0.8, linewidth=2)
    # Add Zipf's law reference line
    zipf_x = np.array(range(1, top_n+1))
    zipf_y = freq_before[0] / zipf_x
    plt.plot(zipf_x, zipf_y, 'r--', label="Zipf's Law", alpha=0.5)
    top_words = [word for word, _ in vocab_before.most_common(5)]
    for rank, word in enumerate(top_words, 1):
        freq = vocab_before[word]
        plt.annotate(word, xy=(rank, freq), xytext=(rank*1.5, freq*1.5),
                     arrowprops=dict(facecolor='black', shrink=0.05, width=1, headwidth=4),
                     fontsize=9, bbox=dict(boxstyle="round,pad=0.3", fc="white", ec="gray", lw=1))
    plt.title('Word Frequency Distribution (Log-Log Scale)', fontsize=14, pad=20)
    plt.xlabel('Word Rank (Log Scale)', fontsize=12)
    plt.ylabel('Frequency (Log Scale)', fontsize=12)
    plt.grid(True, which="both", ls="-", alpha=0.2)
    plt.legend(fontsize=11)
    plt.text(0.02, 0.02,
             "• Steep drop at left = Stopwords dominate\n"
             "• Flatter curve after processing = Better balance\n"
             "• Close to Zipf's line = Natural language pattern",
             transform=plt.gca().transAxes, fontsize=10,
             bbox=dict(boxstyle="round", fc="white", ec="gray", pad=0.4))
    plt.tight_layout()
    plt.show()

plot_word_frequencies(vocab_before, vocab_after_stopwords, vocab_after_stemming)
print(f"💾 Sample saved to '{sample_path}.csv' and '{sample_path}.parquet'.")

19
src/parquet_validator.py Normal file

@@ -0,0 +1,19 @@
# Check whether a file is a valid Parquet file and print some basic information about it.
import pyarrow.parquet as pq
def validate_parquet_file(file_path):
    try:
        parquet_file = pq.ParquetFile(file_path)
    except Exception as e:
        # On failure, report the error and return early so the schema/metadata prints below never see an unset file handle
        print(f"❌ The file '{file_path}' is not a valid Parquet file.")
        print(f"Error: {e}")
        return
    print(f"✅ The file '{file_path}' is a valid Parquet file.")
    print(f" - Schema: {parquet_file.schema}")
    print(f" - File Metadata: {parquet_file.metadata}")
# Example usage:
validate_parquet_file("../data/processed_fakenews.parquet")
validate_parquet_file("../data/sampled_fakenews.parquet")