diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..b820773 --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +pyenv +data diff --git a/archives/fnc1a.log b/archives/fnc1a.log new file mode 100644 index 0000000..436b05a --- /dev/null +++ b/archives/fnc1a.log @@ -0,0 +1,452 @@ +[nltk_data] Downloading package stopwords to +[nltk_data] /home/andrewtrieu/nltk_data... +[nltk_data] Package stopwords is already up-to-date! +[2025-03-26 13:12:19] 📚 Loading spaCy model... +[2025-03-26 13:12:20] 📖 spaCy model loaded. +INFO: Pandarallel will run on 6 workers. +INFO: Pandarallel will use Memory file system to transfer data between the main process and workers. +[2025-03-26 13:12:20] 🧮 Processing text in batches... +[2025-03-26 13:12:21] 🔢 Processing batch 1... +[2025-03-26 13:12:22] 🪙 Tokenizing text... +[2025-03-26 13:12:41] 🚫 Removing stopwords... +[2025-03-26 13:13:09] 🌱 Applying stemming... +[2025-03-26 13:15:28] 📝 Joining tokens back to text... +[2025-03-26 13:15:38] 🔢 Processing batch 2... +[2025-03-26 13:15:39] 🪙 Tokenizing text... +[2025-03-26 13:16:00] 🚫 Removing stopwords... +[2025-03-26 13:16:23] 🌱 Applying stemming... +[2025-03-26 13:18:24] 📝 Joining tokens back to text... +[2025-03-26 13:18:33] 🔢 Processing batch 3... +[2025-03-26 13:18:34] 🪙 Tokenizing text... +[2025-03-26 13:18:55] 🚫 Removing stopwords... +[2025-03-26 13:19:21] 🌱 Applying stemming... +[2025-03-26 13:21:19] 📝 Joining tokens back to text... +[2025-03-26 13:21:27] 🔢 Processing batch 4... +[2025-03-26 13:21:28] 🪙 Tokenizing text... +[2025-03-26 13:21:45] 🚫 Removing stopwords... +[2025-03-26 13:22:08] 🌱 Applying stemming... +[2025-03-26 13:23:58] 📝 Joining tokens back to text... +[2025-03-26 13:24:06] 🔢 Processing batch 5... +[2025-03-26 13:24:07] 🪙 Tokenizing text... +[2025-03-26 13:24:28] 🚫 Removing stopwords... +[2025-03-26 13:24:54] 🌱 Applying stemming... +[2025-03-26 13:26:58] 📝 Joining tokens back to text... +[2025-03-26 13:27:09] 🔢 Processing batch 6... +[2025-03-26 13:27:10] 🪙 Tokenizing text... +[2025-03-26 13:27:31] 🚫 Removing stopwords... +[2025-03-26 13:27:57] 🌱 Applying stemming... +[2025-03-26 13:29:57] 📝 Joining tokens back to text... +[2025-03-26 13:30:07] 🔢 Processing batch 7... +[2025-03-26 13:30:07] 🪙 Tokenizing text... +[2025-03-26 13:30:28] 🚫 Removing stopwords... +[2025-03-26 13:30:55] 🌱 Applying stemming... +[2025-03-26 13:32:51] 📝 Joining tokens back to text... +[2025-03-26 13:33:00] 🔢 Processing batch 8... +[2025-03-26 13:33:00] 🪙 Tokenizing text... +[2025-03-26 13:33:23] 🚫 Removing stopwords... +[2025-03-26 13:33:53] 🌱 Applying stemming... +[2025-03-26 13:36:08] 📝 Joining tokens back to text... +[2025-03-26 13:36:18] 🔢 Processing batch 9... +[2025-03-26 13:36:18] 🪙 Tokenizing text... +[2025-03-26 13:36:40] 🚫 Removing stopwords... +[2025-03-26 13:36:59] 🌱 Applying stemming... +[2025-03-26 13:38:31] 📝 Joining tokens back to text... +[2025-03-26 13:38:38] 🔢 Processing batch 10... +[2025-03-26 13:38:38] 🪙 Tokenizing text... +[2025-03-26 13:38:51] 🚫 Removing stopwords... +[2025-03-26 13:39:06] 🌱 Applying stemming... +[2025-03-26 13:40:20] 📝 Joining tokens back to text... +[2025-03-26 13:40:26] 🔢 Processing batch 11... +[2025-03-26 13:40:26] 🪙 Tokenizing text... +[2025-03-26 13:40:42] 🚫 Removing stopwords... +[2025-03-26 13:41:02] 🌱 Applying stemming... +[2025-03-26 13:42:38] 📝 Joining tokens back to text... +[2025-03-26 13:42:46] 🔢 Processing batch 12... +[2025-03-26 13:42:46] 🪙 Tokenizing text... +[2025-03-26 13:43:05] 🚫 Removing stopwords... 
+[2025-03-26 13:43:30] 🌱 Applying stemming... +[2025-03-26 13:45:27] 📝 Joining tokens back to text... +[2025-03-26 13:45:36] 🔢 Processing batch 13... +[2025-03-26 13:45:36] 🪙 Tokenizing text... +[2025-03-26 13:45:50] 🚫 Removing stopwords... +[2025-03-26 13:46:07] 🌱 Applying stemming... +[2025-03-26 13:47:20] 📝 Joining tokens back to text... +[2025-03-26 13:47:26] 🔢 Processing batch 14... +[2025-03-26 13:47:27] 🪙 Tokenizing text... +[2025-03-26 13:47:40] 🚫 Removing stopwords... +[2025-03-26 13:47:58] 🌱 Applying stemming... +[2025-03-26 13:49:14] 📝 Joining tokens back to text... +[2025-03-26 13:49:22] 🔢 Processing batch 15... +[2025-03-26 13:49:22] 🪙 Tokenizing text... +[2025-03-26 13:49:43] 🚫 Removing stopwords... +[2025-03-26 13:50:06] 🌱 Applying stemming... +[2025-03-26 13:51:52] 📝 Joining tokens back to text... +[2025-03-26 13:52:00] 🔢 Processing batch 16... +[2025-03-26 13:52:01] 🪙 Tokenizing text... +[2025-03-26 13:52:18] 🚫 Removing stopwords... +[2025-03-26 13:52:39] 🌱 Applying stemming... +[2025-03-26 13:54:14] 📝 Joining tokens back to text... +[2025-03-26 13:54:23] 🔢 Processing batch 17... +[2025-03-26 13:54:23] 🪙 Tokenizing text... +[2025-03-26 13:54:45] 🚫 Removing stopwords... +[2025-03-26 13:55:14] 🌱 Applying stemming... +[2025-03-26 13:57:21] 📝 Joining tokens back to text... +[2025-03-26 13:57:33] 🔢 Processing batch 18... +[2025-03-26 13:57:33] 🪙 Tokenizing text... +[2025-03-26 13:57:56] 🚫 Removing stopwords... +[2025-03-26 13:58:22] 🌱 Applying stemming... +[2025-03-26 14:00:39] 📝 Joining tokens back to text... +[2025-03-26 14:00:49] 🔢 Processing batch 19... +[2025-03-26 14:00:49] 🪙 Tokenizing text... +[2025-03-26 14:01:10] 🚫 Removing stopwords... +[2025-03-26 14:01:34] 🌱 Applying stemming... +[2025-03-26 14:03:27] 📝 Joining tokens back to text... +[2025-03-26 14:03:36] 🔢 Processing batch 20... +[2025-03-26 14:03:36] 🪙 Tokenizing text... +[2025-03-26 14:03:59] 🚫 Removing stopwords... +[2025-03-26 14:04:25] 🌱 Applying stemming... +[2025-03-26 14:06:22] 📝 Joining tokens back to text... +[2025-03-26 14:06:31] 🔢 Processing batch 21... +[2025-03-26 14:06:31] 🪙 Tokenizing text... +[2025-03-26 14:06:53] 🚫 Removing stopwords... +[2025-03-26 14:07:19] 🌱 Applying stemming... +[2025-03-26 14:09:41] 📝 Joining tokens back to text... +[2025-03-26 14:09:51] 🔢 Processing batch 22... +[2025-03-26 14:09:52] 🪙 Tokenizing text... +[2025-03-26 14:10:15] 🚫 Removing stopwords... +[2025-03-26 14:10:45] 🌱 Applying stemming... +[2025-03-26 14:13:15] 📝 Joining tokens back to text... +[2025-03-26 14:13:25] 🔢 Processing batch 23... +[2025-03-26 14:13:25] 🪙 Tokenizing text... +[2025-03-26 14:13:46] 🚫 Removing stopwords... +[2025-03-26 14:14:13] 🌱 Applying stemming... +[2025-03-26 14:16:16] 📝 Joining tokens back to text... +[2025-03-26 14:16:27] 🔢 Processing batch 24... +[2025-03-26 14:16:28] 🪙 Tokenizing text... +[2025-03-26 14:16:56] 🚫 Removing stopwords... +[2025-03-26 14:17:30] 🌱 Applying stemming... +[2025-03-26 14:20:45] 📝 Joining tokens back to text... +[2025-03-26 14:21:00] 🔢 Processing batch 25... +[2025-03-26 14:21:01] 🪙 Tokenizing text... +[2025-03-26 14:21:22] 🚫 Removing stopwords... +[2025-03-26 14:21:49] 🌱 Applying stemming... +[2025-03-26 14:23:36] 📝 Joining tokens back to text... +[2025-03-26 14:23:45] 🔢 Processing batch 26... +[2025-03-26 14:23:46] 🪙 Tokenizing text... +[2025-03-26 14:24:12] 🚫 Removing stopwords... +[2025-03-26 14:24:44] 🌱 Applying stemming... +[2025-03-26 14:26:49] 📝 Joining tokens back to text... +[2025-03-26 14:26:59] 🔢 Processing batch 27... 
+[2025-03-26 14:27:00] 🪙 Tokenizing text... +[2025-03-26 14:27:25] 🚫 Removing stopwords... +[2025-03-26 14:27:54] 🌱 Applying stemming... +[2025-03-26 14:29:55] 📝 Joining tokens back to text... +[2025-03-26 14:30:06] 🔢 Processing batch 28... +[2025-03-26 14:30:06] 🪙 Tokenizing text... +[2025-03-26 14:30:28] 🚫 Removing stopwords... +[2025-03-26 14:30:54] 🌱 Applying stemming... +[2025-03-26 14:32:39] 📝 Joining tokens back to text... +[2025-03-26 14:32:49] 🔢 Processing batch 29... +[2025-03-26 14:32:49] 🪙 Tokenizing text... +[2025-03-26 14:33:14] 🚫 Removing stopwords... +[2025-03-26 14:33:44] 🌱 Applying stemming... +[2025-03-26 14:36:18] 📝 Joining tokens back to text... +[2025-03-26 14:36:29] 🔢 Processing batch 30... +[2025-03-26 14:36:29] 🪙 Tokenizing text... +[2025-03-26 14:36:52] 🚫 Removing stopwords... +[2025-03-26 14:37:21] 🌱 Applying stemming... +[2025-03-26 14:39:37] 📝 Joining tokens back to text... +[2025-03-26 14:39:47] 🔢 Processing batch 31... +[2025-03-26 14:39:48] 🪙 Tokenizing text... +[2025-03-26 14:40:09] 🚫 Removing stopwords... +[2025-03-26 14:40:37] 🌱 Applying stemming... +[2025-03-26 14:42:33] 📝 Joining tokens back to text... +[2025-03-26 14:42:43] 🔢 Processing batch 32... +[2025-03-26 14:42:44] 🪙 Tokenizing text... +[2025-03-26 14:43:08] 🚫 Removing stopwords... +[2025-03-26 14:43:33] 🌱 Applying stemming... +[2025-03-26 14:45:06] 📝 Joining tokens back to text... +[2025-03-26 14:45:15] 🔢 Processing batch 33... +[2025-03-26 14:45:15] 🪙 Tokenizing text... +[2025-03-26 14:45:36] 🚫 Removing stopwords... +[2025-03-26 14:45:58] 🌱 Applying stemming... +[2025-03-26 14:47:41] 📝 Joining tokens back to text... +[2025-03-26 14:47:50] 🔢 Processing batch 34... +[2025-03-26 14:47:51] 🪙 Tokenizing text... +[2025-03-26 14:48:12] 🚫 Removing stopwords... +[2025-03-26 14:48:39] 🌱 Applying stemming... +[2025-03-26 14:50:40] 📝 Joining tokens back to text... +[2025-03-26 14:50:50] 🔢 Processing batch 35... +[2025-03-26 14:50:50] 🪙 Tokenizing text... +[2025-03-26 14:51:12] 🚫 Removing stopwords... +[2025-03-26 14:51:41] 🌱 Applying stemming... +[2025-03-26 14:53:41] 📝 Joining tokens back to text... +[2025-03-26 14:53:51] 🔢 Processing batch 36... +[2025-03-26 14:53:52] 🪙 Tokenizing text... +[2025-03-26 14:54:10] 🚫 Removing stopwords... +[2025-03-26 14:54:33] 🌱 Applying stemming... +[2025-03-26 14:56:06] 📝 Joining tokens back to text... +[2025-03-26 14:56:15] 🔢 Processing batch 37... +[2025-03-26 14:56:16] 🪙 Tokenizing text... +[2025-03-26 14:56:36] 🚫 Removing stopwords... +[2025-03-26 14:57:03] 🌱 Applying stemming... +[2025-03-26 14:58:44] 📝 Joining tokens back to text... +[2025-03-26 14:58:54] 🔢 Processing batch 38... +[2025-03-26 14:58:55] 🪙 Tokenizing text... +[2025-03-26 14:59:29] 🚫 Removing stopwords... +[2025-03-26 14:59:59] 🌱 Applying stemming... +[2025-03-26 15:02:26] 📝 Joining tokens back to text... +[2025-03-26 15:02:39] 🔢 Processing batch 39... +[2025-03-26 15:02:40] 🪙 Tokenizing text... +[2025-03-26 15:03:07] 🚫 Removing stopwords... +[2025-03-26 15:03:40] 🌱 Applying stemming... +[2025-03-26 15:06:16] 📝 Joining tokens back to text... +[2025-03-26 15:06:27] 🔢 Processing batch 40... +[2025-03-26 15:06:28] 🪙 Tokenizing text... +[2025-03-26 15:06:49] 🚫 Removing stopwords... +[2025-03-26 15:07:14] 🌱 Applying stemming... +[2025-03-26 15:09:02] 📝 Joining tokens back to text... +[2025-03-26 15:09:12] 🔢 Processing batch 41... +[2025-03-26 15:09:13] 🪙 Tokenizing text... +[2025-03-26 15:09:37] 🚫 Removing stopwords... +[2025-03-26 15:10:05] 🌱 Applying stemming... 
+[2025-03-26 15:12:15] 📝 Joining tokens back to text... +[2025-03-26 15:12:26] 🔢 Processing batch 42... +[2025-03-26 15:12:27] 🪙 Tokenizing text... +[2025-03-26 15:12:50] 🚫 Removing stopwords... +[2025-03-26 15:13:20] 🌱 Applying stemming... +[2025-03-26 15:15:29] 📝 Joining tokens back to text... +[2025-03-26 15:15:40] 🔢 Processing batch 43... +[2025-03-26 15:15:41] 🪙 Tokenizing text... +[2025-03-26 15:16:08] 🚫 Removing stopwords... +[2025-03-26 15:16:39] 🌱 Applying stemming... +[2025-03-26 15:18:47] 📝 Joining tokens back to text... +[2025-03-26 15:19:01] 🔢 Processing batch 44... +[2025-03-26 15:19:02] 🪙 Tokenizing text... +[2025-03-26 15:19:28] 🚫 Removing stopwords... +[2025-03-26 15:19:57] 🌱 Applying stemming... +[2025-03-26 15:21:44] 📝 Joining tokens back to text... +[2025-03-26 15:21:54] 🔢 Processing batch 45... +[2025-03-26 15:21:54] 🪙 Tokenizing text... +[2025-03-26 15:22:14] 🚫 Removing stopwords... +[2025-03-26 15:22:36] 🌱 Applying stemming... +[2025-03-26 15:24:15] 📝 Joining tokens back to text... +[2025-03-26 15:24:28] 🔢 Processing batch 46... +[2025-03-26 15:24:28] 🪙 Tokenizing text... +[2025-03-26 15:24:54] 🚫 Removing stopwords... +[2025-03-26 15:25:24] 🌱 Applying stemming... +[2025-03-26 15:27:19] 📝 Joining tokens back to text... +[2025-03-26 15:27:29] 🔢 Processing batch 47... +[2025-03-26 15:27:29] 🪙 Tokenizing text... +[2025-03-26 15:27:51] 🚫 Removing stopwords... +[2025-03-26 15:28:16] 🌱 Applying stemming... +[2025-03-26 15:29:54] 📝 Joining tokens back to text... +[2025-03-26 15:30:06] 🔢 Processing batch 48... +[2025-03-26 15:30:06] 🪙 Tokenizing text... +[2025-03-26 15:30:28] 🚫 Removing stopwords... +[2025-03-26 15:30:56] 🌱 Applying stemming... +[2025-03-26 15:32:48] 📝 Joining tokens back to text... +[2025-03-26 15:32:59] 🔢 Processing batch 49... +[2025-03-26 15:33:00] 🪙 Tokenizing text... +[2025-03-26 15:33:25] 🚫 Removing stopwords... +[2025-03-26 15:33:52] 🌱 Applying stemming... +[2025-03-26 15:35:40] 📝 Joining tokens back to text... +[2025-03-26 15:35:49] 🔢 Processing batch 50... +[2025-03-26 15:35:49] 🪙 Tokenizing text... +[2025-03-26 15:36:09] 🚫 Removing stopwords... +[2025-03-26 15:36:33] 🌱 Applying stemming... +[2025-03-26 15:38:12] 📝 Joining tokens back to text... +[2025-03-26 15:38:22] 🔢 Processing batch 51... +[2025-03-26 15:38:22] 🪙 Tokenizing text... +[2025-03-26 15:38:42] 🚫 Removing stopwords... +[2025-03-26 15:39:09] 🌱 Applying stemming... +[2025-03-26 15:40:52] 📝 Joining tokens back to text... +[2025-03-26 15:41:02] 🔢 Processing batch 52... +[2025-03-26 15:41:03] 🪙 Tokenizing text... +[2025-03-26 15:41:23] 🚫 Removing stopwords... +[2025-03-26 15:41:49] 🌱 Applying stemming... +[2025-03-26 15:43:36] 📝 Joining tokens back to text... +[2025-03-26 15:43:46] 🔢 Processing batch 53... +[2025-03-26 15:43:47] 🪙 Tokenizing text... +[2025-03-26 15:44:08] 🚫 Removing stopwords... +[2025-03-26 15:44:37] 🌱 Applying stemming... +[2025-03-26 15:46:41] 📝 Joining tokens back to text... +[2025-03-26 15:46:52] 🔢 Processing batch 54... +[2025-03-26 15:46:52] 🪙 Tokenizing text... +[2025-03-26 15:47:14] 🚫 Removing stopwords... +[2025-03-26 15:47:41] 🌱 Applying stemming... +[2025-03-26 15:49:43] 📝 Joining tokens back to text... +[2025-03-26 15:49:54] 🔢 Processing batch 55... +[2025-03-26 15:49:54] 🪙 Tokenizing text... +[2025-03-26 15:50:18] 🚫 Removing stopwords... +[2025-03-26 15:50:48] 🌱 Applying stemming... +[2025-03-26 15:52:41] 📝 Joining tokens back to text... +[2025-03-26 15:52:51] 🔢 Processing batch 56... +[2025-03-26 15:52:51] 🪙 Tokenizing text... 
+[2025-03-26 15:53:14] 🚫 Removing stopwords... +[2025-03-26 15:53:40] 🌱 Applying stemming... +[2025-03-26 15:55:33] 📝 Joining tokens back to text... +[2025-03-26 15:55:44] 🔢 Processing batch 57... +[2025-03-26 15:55:44] 🪙 Tokenizing text... +[2025-03-26 15:56:07] 🚫 Removing stopwords... +[2025-03-26 15:56:36] 🌱 Applying stemming... +[2025-03-26 15:58:33] 📝 Joining tokens back to text... +[2025-03-26 15:58:44] 🔢 Processing batch 58... +[2025-03-26 15:58:45] 🪙 Tokenizing text... +[2025-03-26 15:59:08] 🚫 Removing stopwords... +[2025-03-26 15:59:38] 🌱 Applying stemming... +[2025-03-26 16:01:33] 📝 Joining tokens back to text... +[2025-03-26 16:01:43] 🔢 Processing batch 59... +[2025-03-26 16:01:43] 🪙 Tokenizing text... +[2025-03-26 16:02:07] 🚫 Removing stopwords... +[2025-03-26 16:02:35] 🌱 Applying stemming... +[2025-03-26 16:04:53] 📝 Joining tokens back to text... +[2025-03-26 16:05:05] 🔢 Processing batch 60... +[2025-03-26 16:05:05] 🪙 Tokenizing text... +[2025-03-26 16:05:23] 🚫 Removing stopwords... +[2025-03-26 16:05:47] 🌱 Applying stemming... +[2025-03-26 16:07:20] 📝 Joining tokens back to text... +[2025-03-26 16:07:29] 🔢 Processing batch 61... +[2025-03-26 16:07:29] 🪙 Tokenizing text... +[2025-03-26 16:07:52] 🚫 Removing stopwords... +[2025-03-26 16:08:12] 🌱 Applying stemming... +[2025-03-26 16:09:52] 📝 Joining tokens back to text... +[2025-03-26 16:10:01] 🔢 Processing batch 62... +[2025-03-26 16:10:01] 🪙 Tokenizing text... +[2025-03-26 16:10:19] 🚫 Removing stopwords... +[2025-03-26 16:10:40] 🌱 Applying stemming... +[2025-03-26 16:12:10] 📝 Joining tokens back to text... +[2025-03-26 16:12:18] 🔢 Processing batch 63... +[2025-03-26 16:12:19] 🪙 Tokenizing text... +[2025-03-26 16:12:35] 🚫 Removing stopwords... +[2025-03-26 16:12:56] 🌱 Applying stemming... +[2025-03-26 16:14:25] 📝 Joining tokens back to text... +[2025-03-26 16:14:35] 🔢 Processing batch 64... +[2025-03-26 16:14:36] 🪙 Tokenizing text... +[2025-03-26 16:15:00] 🚫 Removing stopwords... +[2025-03-26 16:15:29] 🌱 Applying stemming... +[2025-03-26 16:17:47] 📝 Joining tokens back to text... +[2025-03-26 16:17:58] 🔢 Processing batch 65... +[2025-03-26 16:17:58] 🪙 Tokenizing text... +[2025-03-26 16:18:20] 🚫 Removing stopwords... +[2025-03-26 16:18:49] 🌱 Applying stemming... +[2025-03-26 16:20:46] 📝 Joining tokens back to text... +[2025-03-26 16:20:56] 🔢 Processing batch 66... +[2025-03-26 16:20:57] 🪙 Tokenizing text... +[2025-03-26 16:21:20] 🚫 Removing stopwords... +[2025-03-26 16:21:52] 🌱 Applying stemming... +[2025-03-26 16:23:53] 📝 Joining tokens back to text... +[2025-03-26 16:24:05] 🔢 Processing batch 67... +[2025-03-26 16:24:05] 🪙 Tokenizing text... +[2025-03-26 16:24:29] 🚫 Removing stopwords... +[2025-03-26 16:24:51] 🌱 Applying stemming... +[2025-03-26 16:26:26] 📝 Joining tokens back to text... +[2025-03-26 16:26:35] 🔢 Processing batch 68... +[2025-03-26 16:26:35] 🪙 Tokenizing text... +[2025-03-26 16:26:56] 🚫 Removing stopwords... +[2025-03-26 16:27:22] 🌱 Applying stemming... +[2025-03-26 16:29:06] 📝 Joining tokens back to text... +[2025-03-26 16:29:16] 🔢 Processing batch 69... +[2025-03-26 16:29:17] 🪙 Tokenizing text... +[2025-03-26 16:29:35] 🚫 Removing stopwords... +[2025-03-26 16:30:00] 🌱 Applying stemming... +[2025-03-26 16:31:41] 📝 Joining tokens back to text... +[2025-03-26 16:31:51] 🔢 Processing batch 70... +[2025-03-26 16:31:51] 🪙 Tokenizing text... +[2025-03-26 16:32:11] 🚫 Removing stopwords... +[2025-03-26 16:32:35] 🌱 Applying stemming... +[2025-03-26 16:34:13] 📝 Joining tokens back to text... 
+[2025-03-26 16:34:23] 🔢 Processing batch 71... +[2025-03-26 16:34:23] 🪙 Tokenizing text... +[2025-03-26 16:34:45] 🚫 Removing stopwords... +[2025-03-26 16:35:13] 🌱 Applying stemming... +[2025-03-26 16:36:58] 📝 Joining tokens back to text... +[2025-03-26 16:37:08] 🔢 Processing batch 72... +[2025-03-26 16:37:08] 🪙 Tokenizing text... +[2025-03-26 16:37:33] 🚫 Removing stopwords... +[2025-03-26 16:37:59] 🌱 Applying stemming... +[2025-03-26 16:39:35] 📝 Joining tokens back to text... +[2025-03-26 16:39:45] 🔢 Processing batch 73... +[2025-03-26 16:39:45] 🪙 Tokenizing text... +[2025-03-26 16:40:06] 🚫 Removing stopwords... +[2025-03-26 16:40:32] 🌱 Applying stemming... +[2025-03-26 16:42:13] 📝 Joining tokens back to text... +[2025-03-26 16:42:24] 🔢 Processing batch 74... +[2025-03-26 16:42:25] 🪙 Tokenizing text... +[2025-03-26 16:42:46] 🚫 Removing stopwords... +[2025-03-26 16:43:12] 🌱 Applying stemming... +[2025-03-26 16:44:51] 📝 Joining tokens back to text... +[2025-03-26 16:45:01] 🔢 Processing batch 75... +[2025-03-26 16:45:01] 🪙 Tokenizing text... +[2025-03-26 16:45:23] 🚫 Removing stopwords... +[2025-03-26 16:45:48] 🌱 Applying stemming... +[2025-03-26 16:47:28] 📝 Joining tokens back to text... +[2025-03-26 16:47:39] 🔢 Processing batch 76... +[2025-03-26 16:47:39] 🪙 Tokenizing text... +[2025-03-26 16:48:00] 🚫 Removing stopwords... +[2025-03-26 16:48:31] 🌱 Applying stemming... +[2025-03-26 16:50:15] 📝 Joining tokens back to text... +[2025-03-26 16:50:25] 🔢 Processing batch 77... +[2025-03-26 16:50:25] 🪙 Tokenizing text... +[2025-03-26 16:50:52] 🚫 Removing stopwords... +[2025-03-26 16:51:18] 🌱 Applying stemming... +[2025-03-26 16:53:01] 📝 Joining tokens back to text... +[2025-03-26 16:53:11] 🔢 Processing batch 78... +[2025-03-26 16:53:11] 🪙 Tokenizing text... +[2025-03-26 16:53:33] 🚫 Removing stopwords... +[2025-03-26 16:54:00] 🌱 Applying stemming... +[2025-03-26 16:55:40] 📝 Joining tokens back to text... +[2025-03-26 16:55:50] 🔢 Processing batch 79... +[2025-03-26 16:55:51] 🪙 Tokenizing text... +[2025-03-26 16:56:11] 🚫 Removing stopwords... +[2025-03-26 16:56:38] 🌱 Applying stemming... +[2025-03-26 16:58:24] 📝 Joining tokens back to text... +[2025-03-26 16:58:34] 🔢 Processing batch 80... +[2025-03-26 16:58:35] 🪙 Tokenizing text... +[2025-03-26 16:58:55] 🚫 Removing stopwords... +[2025-03-26 16:59:22] 🌱 Applying stemming... +[2025-03-26 17:01:05] 📝 Joining tokens back to text... +[2025-03-26 17:01:15] 🔢 Processing batch 81... +[2025-03-26 17:01:16] 🪙 Tokenizing text... +[2025-03-26 17:01:37] 🚫 Removing stopwords... +[2025-03-26 17:02:05] 🌱 Applying stemming... +[2025-03-26 17:03:55] 📝 Joining tokens back to text... +[2025-03-26 17:04:06] 🔢 Processing batch 82... +[2025-03-26 17:04:07] 🪙 Tokenizing text... +[2025-03-26 17:04:34] 🚫 Removing stopwords... +[2025-03-26 17:05:08] 🌱 Applying stemming... +[2025-03-26 17:07:03] 📝 Joining tokens back to text... +[2025-03-26 17:07:14] 🔢 Processing batch 83... +[2025-03-26 17:07:15] 🪙 Tokenizing text... +[2025-03-26 17:07:40] 🚫 Removing stopwords... +[2025-03-26 17:08:08] 🌱 Applying stemming... +[2025-03-26 17:10:06] 📝 Joining tokens back to text... +[2025-03-26 17:10:18] 🔢 Processing batch 84... +[2025-03-26 17:10:19] 🪙 Tokenizing text... +[2025-03-26 17:10:46] 🚫 Removing stopwords... +[2025-03-26 17:11:17] 🌱 Applying stemming... +[2025-03-26 17:13:35] 📝 Joining tokens back to text... +[2025-03-26 17:13:48] 🔢 Processing batch 85... +[2025-03-26 17:13:49] 🪙 Tokenizing text... +[2025-03-26 17:14:19] 🚫 Removing stopwords... 
+[2025-03-26 17:14:53] 🌱 Applying stemming... +[2025-03-26 17:17:16] 📝 Joining tokens back to text... +[2025-03-26 17:17:28] 🔢 Processing batch 86... +[2025-03-26 17:17:28] 🪙 Tokenizing text... +[2025-03-26 17:17:43] 🚫 Removing stopwords... +[2025-03-26 17:17:57] 🌱 Applying stemming... +[2025-03-26 17:18:44] 📝 Joining tokens back to text... +[2025-03-26 17:26:42] 💾 Processed data saved to '../data/processed_fakenews.parquet' and '../data/processed_fakenews.csv' +[2025-03-26 17:26:42] 📊 Total words (the raw number of all words in the text, including duplicates): 3,307,195,209 +⏮️ Before stopword removal: 3,307,195,209 +🔻 After stopword removal: 1,744,854,554 (-47.24%) +[2025-03-26 17:26:42] 🫆 Vocabulary (the number of distinct words in the text, ignoring duplicates): +⏮️ Before stemming: 2,767,790 +🔻 After stemming: 2,415,853 (-12.71%) +[2025-03-26 17:26:42] 📏 Avg. length of retained words: +⏮️ After stopword removal: 6.34 +🔻 After stemming: 5.41 (-14.74%) +[2025-03-26 17:27:57] 📌 Top 10 words: +🔝 Before preprocessing: [('the', 220550046), ('of', 106480206), ('to', 106216779), ('and', 96588260), ('a', 83333254), ('in', 73132963), ('that', 45117566), ('is', 42309500), ('for', 36971267), ('on', 29213684)] +🔝 After stopword removal: [('new', 8599601), ('one', 8315754), ('would', 8145653), ('said', 7354978), ('people', 5996078), ('also', 5692918), ('like', 5565201), ('even', 4472256), ('us', 4463611), ('could', 4114863)] +🔝 After stemming: [('new', 8600723), ('one', 8599171), ('would', 8145683), ('said', 7355057), ('like', 6797658), ('state', 6085922), ('peopl', 6060566), ('use', 5697023), ('also', 5692967), ('time', 5630219)] diff --git a/archives/fnc1a.png b/archives/fnc1a.png new file mode 100644 index 0000000..12a3b00 Binary files /dev/null and b/archives/fnc1a.png differ diff --git a/archives/fnc1b.log b/archives/fnc1b.log new file mode 100644 index 0000000..fdba991 --- /dev/null +++ b/archives/fnc1b.log @@ -0,0 +1,6 @@ +nohup: ignoring input +🔍 Loading data from Parquet file at 'processed_fakenews.parquet' +🔍 Dataset contains 8,528,956 rows. +📉 Reducing dataset from 8,528,956 to 852,895 rows... +✅ Sample contains 852,895 rows (expected 852,895 rows) +💾 Sample saved to 'sampled_fakenews.csv' and 'sampled_fakenews.parquet'. diff --git a/src/fnc1a.py b/src/fnc1a.py index c81d7b8..3527d4c 100644 --- a/src/fnc1a.py +++ b/src/fnc1a.py @@ -1,27 +1,205 @@ -import random +import numpy as np import pandas as pd +import spacy +import nltk +import matplotlib.pyplot as plt +from nltk.corpus import stopwords +from nltk.stem import PorterStemmer +from collections import Counter +from pandarallel import pandarallel +import multiprocessing import os -import subprocess +import pyarrow.parquet as pq +import pyarrow as pa +from datetime import datetime -data_path = "./FNC/news_cleaned_2018_02_13.csv" -sample_path = "sampled_news" -SAMPLE_FRACTION = 0.001 # Use 0.001 for 0.1% of the dataset +# Print log messages with timestamp +def print_log(msg): + print(f"[{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}] {msg}") -if not os.path.exists(data_path): - print(f"❌ Error: File not found at {data_path}") +# Download NLTK stopwords +nltk.download('stopwords') + +# Load spaCy model +print_log("📚 Loading spaCy model...") +try: + nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"]) +except OSError: + import subprocess + print_log("⬇️ Model not found. 
Downloading...") + subprocess.run(["python", "-m", "spacy", "download", "en_core_web_sm"]) + nlp = spacy.load("en_core_web_sm") +print_log("📖 spaCy model loaded.") + +# Paths +csv_path = "../data/news_cleaned_2018_02_13.csv" +parquet_path = "../data/news_cleaned_2018_02_13.parquet" +output_parquet = "../data/processed_fakenews.parquet" +output_csv = "../data/processed_fakenews.csv" + +# Convert CSV to Parquet if needed +if os.path.exists(parquet_path): + data_path = parquet_path +elif os.path.exists(csv_path): + print_log("🔄 Converting CSV to Parquet...") + + chunksize=1e5 + pqwriter = None + for i, df in enumerate(pd.read_csv(csv_path, lineterminator="\n", on_bad_lines="skip", chunksize=chunksize, usecols=["id", "content", "type"])): + table = pa.Table.from_pandas(df) + # If it's the first chunk, create a new parquet writer + if i == 0: + pqwriter = pq.ParquetWriter(parquet_path, table.schema) + pqwriter.write_table(table) + + if pqwriter: + pqwriter.close() + + print_log("✅ Conversion complete.") + data_path = parquet_path +else: + print_log("❌ Error: No dataset found.") exit() -# Get total rows. Only works on Unix-like systems due to `wc` command -total_rows = int(subprocess.check_output(["wc", "-l", data_path]).split()[0]) - 1 -print(f"🔍 Dataset contains {total_rows:,} rows.") +# Stopwords & Stemmer +stop_words = set(stopwords.words("english")) +stemmer = PorterStemmer() -sample_size = int(total_rows * SAMPLE_FRACTION) -print(f"📉 Reducing dataset to {sample_size:,} rows...") +# Initialize parallel processing +# !WARNING: This will use all available CPU cores, might kill host machine +# Set progress_bar=True to see a progress bar +pandarallel.initialize(nb_workers=max(1, int(multiprocessing.cpu_count())), progress_bar=False) -# Read only a sample -skip_rows = sorted(random.sample(range(1, total_rows + 1), total_rows - sample_size)) -df_sample = pd.read_csv(data_path, skiprows=skip_rows, lineterminator="\n", on_bad_lines="skip") -df_sample.to_csv(f"{sample_path}.csv", index=False) -df_sample.to_parquet(f"{sample_path}.parquet", index=False) +batch_size = 100000 +parquet_file = pq.ParquetFile(data_path) -print("✅ Sample saved to sampled_news.csv and sampled_news.parquet.") +processed_chunks = [] +vocab_before = Counter() +vocab_after_stopwords = Counter() +vocab_after_stemming = Counter() +total_words_before = 0 +total_words_after_stopwords = 0 +total_words_after_stemming = 0 +total_chars_after_stopwords = 0 +total_chars_after_stemming = 0 + +# Process text in batches +print_log("🧮 Processing text in batches...") +batch_num = 0 +for batch in parquet_file.iter_batches(batch_size): + print_log(f"🔢 Processing batch {batch_num + 1}...") + chunk = batch.to_pandas() + chunk = chunk.dropna(subset=["content"]).astype({'content': 'string'}) + + # Tokenize, remove stopwords, and apply stemming + print_log("🪙 Tokenizing text...") + chunk_tokens = chunk["content"].parallel_apply(lambda text: [word.lower() for word in text.split() if word.isalpha()]) + for tokens in chunk_tokens: + vocab_before.update(tokens) + total_words_before += len(tokens) + + print_log("🚫 Removing stopwords...") + chunk_no_stopwords = chunk_tokens.parallel_apply(lambda tokens: [word for word in tokens if word not in stop_words]) + for tokens in chunk_no_stopwords: + vocab_after_stopwords.update(tokens) + total_words_after_stopwords += len(tokens) + total_chars_after_stopwords += sum(len(word) for word in tokens) + + print_log("🌱 Applying stemming...") + chunk_stemmed = chunk_no_stopwords.parallel_apply(lambda tokens: 
[stemmer.stem(word) for word in tokens])
+    for tokens in chunk_stemmed:
+        vocab_after_stemming.update(tokens)
+        total_words_after_stemming += len(tokens)
+        total_chars_after_stemming += sum(len(word) for word in tokens)
+
+    # Join tokens back to text
+    print_log("📝 Joining tokens back to text...")
+    chunk["processed_text"] = chunk_stemmed.parallel_apply(lambda tokens: ' '.join(tokens))
+    processed_chunks.append(chunk[["id", "processed_text", "type"]])
+    batch_num += 1
+
+# Save processed data
+final_df = pd.concat(processed_chunks, ignore_index=True)
+final_df.to_parquet(output_parquet, index=False)
+final_df.to_csv(output_csv, index=False)
+
+print_log(f"💾 Processed data saved to '{output_parquet}' and '{output_csv}'")
+
+# Print statistics
+total_vocab_before = len(vocab_before)
+total_vocab_after_stopwords = len(vocab_after_stopwords)
+total_vocab_after_stemming = len(vocab_after_stemming)
+
+total_stopword_reduction = (total_words_before - total_words_after_stopwords) / total_words_before * 100
+print_log(f"📊 Total words (the raw number of all words in the text, including duplicates): {total_words_before:,}")
+print(f"⏮️ Before stopword removal: {total_words_before:,}")
+print(f"🔻 After stopword removal: {total_words_after_stopwords:,} (-{total_stopword_reduction:.2f}%)")
+
+vocab_stemming_reduction = (total_vocab_after_stopwords - total_vocab_after_stemming) / total_vocab_after_stopwords * 100
+print_log("🫆 Vocabulary (the number of distinct words in the text, ignoring duplicates):")
+# Stemming runs on the stopword-filtered tokens, so the post-stopword vocabulary
+# is the correct "before stemming" baseline; printing vocab_before here would not
+# match the percentage computed above.
+print(f"⏮️ Before stemming: {total_vocab_after_stopwords:,}")
+print(f"🔻 After stemming: {total_vocab_after_stemming:,} (-{vocab_stemming_reduction:.2f}%)")
+
+avg_chars_after_stopwords = total_chars_after_stopwords / total_words_after_stopwords
+avg_chars_after_stemming = total_chars_after_stemming / total_words_after_stemming
+avg_chars_reduction = (avg_chars_after_stopwords - avg_chars_after_stemming) / avg_chars_after_stopwords * 100
+print_log(f"📏 Avg. 
length of retained words:") +print(f"⏮️ After stopword removal: {avg_chars_after_stopwords:.2f}") +print(f"🔻 After stemming: {avg_chars_after_stemming:.2f} (-{avg_chars_reduction:.2f}%)") + +# Get most frequent words before and after stopword removal & stemming +def get_most_frequent_words(vocab, top_n=10): + return vocab.most_common(top_n) + +top_words_before = get_most_frequent_words(vocab_before) +top_words_after_stopwords = get_most_frequent_words(vocab_after_stopwords) +top_words_after_stemming = get_most_frequent_words(vocab_after_stemming) + +print_log("📌 Top 10 words:") +print("🔝 Before preprocessing:", top_words_before) +print("🔝 After stopword removal:", top_words_after_stopwords) +print("🔝 After stemming:", top_words_after_stemming) + +def plot_word_frequencies(vocab_before, vocab_after_stopwords, vocab_after_stemming, top_n=10000): + plt.figure(figsize=(12, 7)) + + freq_before = [freq for _, freq in vocab_before.most_common(top_n)] + freq_after_stopwords = [freq for _, freq in vocab_after_stopwords.most_common(top_n)] + freq_after_stemming = [freq for _, freq in vocab_after_stemming.most_common(top_n)] + + plt.loglog(range(1, len(freq_before)+1), freq_before, + label='Raw Text', color='royalblue', alpha=0.8, linewidth=2) + plt.loglog(range(1, len(freq_after_stopwords)+1), freq_after_stopwords, + label='After Stopword Removal', color='orange', alpha=0.8, linewidth=2) + plt.loglog(range(1, len(freq_after_stemming)+1), freq_after_stemming, + label='After Stemming', color='green', alpha=0.8, linewidth=2) + + # Add Zipf's law reference line + zipf_x = np.array(range(1, top_n+1)) + zipf_y = freq_before[0] / zipf_x + plt.plot(zipf_x, zipf_y, 'r--', label="Zipf's Law", alpha=0.5) + + top_words = [word for word, _ in vocab_before.most_common(5)] + for rank, word in enumerate(top_words, 1): + freq = vocab_before[word] + plt.annotate(word, xy=(rank, freq), xytext=(rank*1.5, freq*1.5), + arrowprops=dict(facecolor='black', shrink=0.05, width=1, headwidth=4), + fontsize=9, bbox=dict(boxstyle="round,pad=0.3", fc="white", ec="gray", lw=1)) + + plt.title('Word Frequency Distribution (Log-Log Scale)', fontsize=14, pad=20) + plt.xlabel('Word Rank (Log Scale)', fontsize=12) + plt.ylabel('Frequency (Log Scale)', fontsize=12) + plt.grid(True, which="both", ls="-", alpha=0.2) + plt.legend(fontsize=11) + + plt.text(0.02, 0.02, + "• Steep drop at left = Stopwords dominate\n" + "• Flatter curve after processing = Better balance\n" + "• Close to Zipf's line = Natural language pattern", + transform=plt.gca().transAxes, fontsize=10, + bbox=dict(boxstyle="round", fc="white", ec="gray", pad=0.4)) + + plt.tight_layout() + plt.show() + +plot_word_frequencies(vocab_before, vocab_after_stopwords, vocab_after_stemming) \ No newline at end of file diff --git a/src/fnc1b.py b/src/fnc1b.py index d8d0164..796fecd 100644 --- a/src/fnc1b.py +++ b/src/fnc1b.py @@ -1,182 +1,75 @@ -import numpy as np +import random import pandas as pd -import spacy -import nltk -import matplotlib.pyplot as plt -from nltk.corpus import stopwords -from nltk.stem import PorterStemmer -from collections import Counter -from pandarallel import pandarallel -import multiprocessing import os +import subprocess +import pyarrow as pa import pyarrow.parquet as pq -# Download NLTK stopwords -nltk.download('stopwords') +parquet_path = "../data/processed_fakenews.parquet" +csv_path = "../data/processed_fakenews.csv" +sample_path = "../data/sampled_fakenews" +SAMPLE_FRACTION = 0.1 +RANDOM_SEED = 42 # For reproducibility -# Paths -csv_path = 
"sampled_news.csv" -parquet_path = "sampled_news_sm.parquet" -output_parquet = "processed_fakenews.parquet" -output_csv = "processed_fakenews.csv" +def get_sample_size(total_rows, log=False): + sample_size = int(total_rows * SAMPLE_FRACTION) + if log: + print(f"📉 Reducing dataset from {total_rows:,} to {sample_size:,} rows...") + return sample_size -# Convert CSV to Parquet if needed +def sample_dataframe(df, total_rows): + sample_size = get_sample_size(total_rows=total_rows, log=True) + return df.sample(n=sample_size, random_state=RANDOM_SEED) + +# Try to load from Parquet first, fall back to CSV if not available if os.path.exists(parquet_path): - data_path = parquet_path + print(f"🔍 Loading data from Parquet file at '{parquet_path}'") + try: + # Read metadata to get row count without loading entire file + parquet_file = pq.ParquetFile(parquet_path) + total_rows = parquet_file.metadata.num_rows + print(f"🔍 Dataset contains {total_rows:,} rows.") + + # Read and sample the data + df_sample = sample_dataframe(pd.read_parquet(parquet_path), total_rows) + + except Exception as e: + print(f"❌ Error reading Parquet file: {e}") + print("🔄 Falling back to CSV...") + if not os.path.exists(csv_path): + print(f"❌ Error: Neither Parquet nor CSV file found at {parquet_path} or {csv_path}") + exit() + + # Get total rows from CSV (Unix-like systems only due to `wc`) + total_rows = int(subprocess.check_output(["wc", "-l", csv_path]).split()[0]) - 1 + print(f"🔍 Dataset contains {total_rows:,} rows.") + + # Read and sample the data + df_sample = sample_dataframe( + pd.read_csv(csv_path, lineterminator="\n", on_bad_lines="skip"), + total_rows + ) + elif os.path.exists(csv_path): - print("🔄 Converting CSV to Parquet...") - df = pd.read_csv(csv_path, lineterminator="\n", on_bad_lines="skip", usecols=["id", "content", "type"]) - df.to_parquet(parquet_path, index=False) - print("✅ Conversion complete.") - data_path = parquet_path + print(f"🔍 Parquet file not found, loading from CSV at {csv_path}") + # Get total rows from CSV (Unix-like systems only due to `wc`) + total_rows = int(subprocess.check_output(["wc", "-l", csv_path]).split()[0]) - 1 + print(f"🔍 Dataset contains {total_rows:,} rows.") + + # Read and sample the data + df_sample = sample_dataframe( + pd.read_csv(csv_path, lineterminator="\n", on_bad_lines="skip"), + total_rows + ) else: - print("❌ Error: No dataset found.") + print(f"❌ Error: Neither Parquet nor CSV file found at {parquet_path} or {csv_path}") exit() -# Load spaCy model -print("📚 Loading spaCy model...") -try: - nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"]) -except OSError: - import subprocess - print("⬇️ Model not found. 
Downloading...") - subprocess.run(["python", "-m", "spacy", "download", "en_core_web_sm"]) - nlp = spacy.load("en_core_web_sm") -print("📖 spaCy model loaded.") +# Verify the sample size +print(f"✅ Sample contains {len(df_sample):,} rows (expected {get_sample_size(total_rows=total_rows):,} rows)") -# Stopwords & Stemmer -stop_words = set(stopwords.words("english")) -stemmer = PorterStemmer() +# Save the sample in both formats +df_sample.to_csv(f"{sample_path}.csv", index=False) +df_sample.to_parquet(f"{sample_path}.parquet", index=False) -# Initialize parallel processing -pandarallel.initialize(nb_workers=max(1, int(multiprocessing.cpu_count() / 2)), progress_bar=True) - -batch_size = 100000 -parquet_file = pq.ParquetFile(data_path) - -processed_chunks = [] -vocab_before = Counter() -vocab_after_stopwords = Counter() -vocab_after_stemming = Counter() -total_words_before = 0 -total_words_after_stopwords = 0 -total_words_after_stemming = 0 - -total_chars_after_stopwords = 0 -total_chars_after_stemming = 0 - -print("🧮 Processing text in batches...") -batch_num = 0 -for batch in parquet_file.iter_batches(batch_size): - print(f"🔢 Processing batch {batch_num + 1}...") - chunk = batch.to_pandas() - chunk = chunk.dropna(subset=["content"]).astype({'content': 'string'}) - - print("🪙 Tokenizing text...") - chunk_tokens = chunk["content"].parallel_apply(lambda text: [word.lower() for word in text.split() if word.isalpha()]) - for tokens in chunk_tokens: - vocab_before.update(tokens) - total_words_before += len(tokens) - - print("🚫 Removing stopwords...") - chunk_no_stopwords = chunk_tokens.parallel_apply(lambda tokens: [word for word in tokens if word not in stop_words]) - for tokens in chunk_no_stopwords: - vocab_after_stopwords.update(tokens) - total_words_after_stopwords += len(tokens) - total_chars_after_stopwords += sum(len(word) for word in tokens) - - print("🌱 Applying stemming...") - chunk_stemmed = chunk_no_stopwords.parallel_apply(lambda tokens: [stemmer.stem(word) for word in tokens]) - for tokens in chunk_stemmed: - vocab_after_stemming.update(tokens) - total_words_after_stemming += len(tokens) - total_chars_after_stemming += sum(len(word) for word in tokens) - - print("📝 Joining tokens back to text...") - chunk["processed_text"] = chunk_stemmed.parallel_apply(lambda tokens: ' '.join(tokens)) - processed_chunks.append(chunk[["id", "processed_text", "type"]]) - batch_num += 1 - -# Save processed data -final_df = pd.concat(processed_chunks, ignore_index=True) -final_df.to_parquet(output_parquet, index=False) -final_df.to_csv(output_csv, index=False) - -print(f"💾 Processed data saved to '{output_parquet}' and '{output_csv}'") - -total_vocab_before = len(vocab_before) -total_vocab_after_stopwords = len(vocab_after_stopwords) -total_vocab_after_stemming = len(vocab_after_stemming) - -total_stopword_reduction = (total_words_before - total_words_after_stopwords) / total_words_before * 100 -print(f"📊 Total words (the raw number of all words in the text, including duplicates): {total_words_before:,}") -print(f"⏮️ Before stopword removal: {total_words_before:,}") -print(f"🔻 After stopword removal: {total_words_after_stopwords:,} (-{total_stopword_reduction:.2f}%)") - -vocab_stemming_reduction = (total_vocab_after_stopwords - total_vocab_after_stemming) / total_vocab_after_stopwords * 100 -print(f"🫆 Vocabulary (the number of distinct words in the text, ignoring duplicates):") -print(f"⏮️ Before stemming: {total_vocab_before:,}") -print(f"🔻 After stemming: {total_vocab_after_stemming:,} 
(-{vocab_stemming_reduction:.2f}%)") - -avg_chars_after_stopwords = total_chars_after_stopwords / total_words_after_stopwords -avg_chars_after_stemming = total_chars_after_stemming / total_words_after_stemming -avg_chars_reduction = (avg_chars_after_stopwords - avg_chars_after_stemming) / avg_chars_after_stopwords * 100 -print(f"📏 Avg. length of retained words:") -print(f"⏮️ After stopword removal: {avg_chars_after_stopwords:.2f}") -print(f"🔻 After stemming: {avg_chars_after_stemming:.2f} (-{avg_chars_reduction:.2f}%)") - -# Get most frequent words before and after stopword removal & stemming -def get_most_frequent_words(vocab, top_n=10): - return vocab.most_common(top_n) - -top_words_before = get_most_frequent_words(vocab_before) -top_words_after_stopwords = get_most_frequent_words(vocab_after_stopwords) -top_words_after_stemming = get_most_frequent_words(vocab_after_stemming) - -print("📌 Top 10 words before preprocessing:", top_words_before) -print("📌 Top 10 words after stopword removal:", top_words_after_stopwords) -print("📌 Top 10 words after stemming:", top_words_after_stemming) - -def plot_word_frequencies(vocab_before, vocab_after_stopwords, vocab_after_stemming, top_n=10000): - plt.figure(figsize=(12, 7)) - - freq_before = [freq for _, freq in vocab_before.most_common(top_n)] - freq_after_stopwords = [freq for _, freq in vocab_after_stopwords.most_common(top_n)] - freq_after_stemming = [freq for _, freq in vocab_after_stemming.most_common(top_n)] - - plt.loglog(range(1, len(freq_before)+1), freq_before, - label='Raw Text', color='royalblue', alpha=0.8, linewidth=2) - plt.loglog(range(1, len(freq_after_stopwords)+1), freq_after_stopwords, - label='After Stopword Removal', color='orange', alpha=0.8, linewidth=2) - plt.loglog(range(1, len(freq_after_stemming)+1), freq_after_stemming, - label='After Stemming', color='green', alpha=0.8, linewidth=2) - - # Add Zipf's law reference line - zipf_x = np.array(range(1, top_n+1)) - zipf_y = freq_before[0] / zipf_x - plt.plot(zipf_x, zipf_y, 'r--', label="Zipf's Law", alpha=0.5) - - top_words = [word for word, _ in vocab_before.most_common(5)] - for rank, word in enumerate(top_words, 1): - freq = vocab_before[word] - plt.annotate(word, xy=(rank, freq), xytext=(rank*1.5, freq*1.5), - arrowprops=dict(facecolor='black', shrink=0.05, width=1, headwidth=4), - fontsize=9, bbox=dict(boxstyle="round,pad=0.3", fc="white", ec="gray", lw=1)) - - plt.title('Word Frequency Distribution (Log-Log Scale)', fontsize=14, pad=20) - plt.xlabel('Word Rank (Log Scale)', fontsize=12) - plt.ylabel('Frequency (Log Scale)', fontsize=12) - plt.grid(True, which="both", ls="-", alpha=0.2) - plt.legend(fontsize=11) - - plt.text(0.02, 0.02, - "• Steep drop at left = Stopwords dominate\n" - "• Flatter curve after processing = Better balance\n" - "• Close to Zipf's line = Natural language pattern", - transform=plt.gca().transAxes, fontsize=10, - bbox=dict(boxstyle="round", fc="white", ec="gray", pad=0.4)) - - plt.tight_layout() - plt.show() - -plot_word_frequencies(vocab_before, vocab_after_stopwords, vocab_after_stemming) \ No newline at end of file +print(f"💾 Sample saved to '{sample_path}.csv' and '{sample_path}.parquet'.") \ No newline at end of file diff --git a/src/parquet_validator.py b/src/parquet_validator.py new file mode 100644 index 0000000..d4adc4b --- /dev/null +++ b/src/parquet_validator.py @@ -0,0 +1,19 @@ +# Validate if a parquet file is valid or not, and print out some information about the file. 
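+# Note: constructing pq.ParquetFile only reads the file footer (schema and
+# row-group metadata), not the data pages, so this check stays cheap even for
+# multi-gigabyte files; a truncated or partially written file fails here
+# because the closing footer magic is missing.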
+import pyarrow.parquet as pq
+
+def validate_parquet_file(file_path):
+    try:
+        parquet_file = pq.ParquetFile(file_path)
+    except Exception as e:
+        print(f"❌ The file '{file_path}' is not a valid Parquet file.")
+        print(f"Error: {e}")
+        return  # No schema or metadata to report for an unreadable file
+
+    print(f"✅ The file '{file_path}' is a valid Parquet file.")
+    print(f" - Column Names: {parquet_file.schema}")
+    print(f" - File Metadata: {parquet_file.metadata}")
+
+# Example usage:
+validate_parquet_file("../data/processed_fakenews.parquet")
+validate_parquet_file("../data/sampled_fakenews.parquet")
+