diff --git a/Pipfile b/Pipfile index 3266d0b..cb39b78 100644 --- a/Pipfile +++ b/Pipfile @@ -9,6 +9,7 @@ loguru = "*" gogettr = "*" requests = "*" bs4 = "*" +lxml = "*" dateparser = "*" boto3 = "*" ffmpeg-python = "*" diff --git a/Pipfile.lock b/Pipfile.lock index 5db0836..6875bcd 100644 --- a/Pipfile.lock +++ b/Pipfile.lock @@ -1,7 +1,7 @@ { "_meta": { "hash": { - "sha256": "3f0312747083221d57ac2c4ce96786a6ead34aa3a3a3519fed4ea4382f672633" + "sha256": "13cc50755a59b2cd8bf93049a9a695aa27d35b973b0bdc154af5d21ce48fd57f" }, "pipfile-spec": 6, "requires": { @@ -21,7 +21,6 @@ "sha256:58d5c3d29f5a36ffeb94f02f0d786cd53014cf9b3b3951d42e0080d8a9498d30", "sha256:ad9aa55b65ef2808eb405f46cf74df7fcb7044d5cbc26487f96eb2ef2e436693" ], - "markers": "python_version >= '3.6'", "version": "==4.11.1" }, "blis": { @@ -50,19 +49,18 @@ }, "boto3": { "hashes": [ - "sha256:927b5e8e2decad746e6c32bb81f15c2ea9ab4398286134d21f6742493eb893f6", - "sha256:e3c10adc7be890b147568a4162d9cafb876f11f87460c4a0dc90742d6d4ebe7c" + "sha256:1c13d555172cf88eb645af2429e4a7f42be85e365d6ffc110c952a556d3f8808", + "sha256:4af6a8bc5110b5f9d2fbd00a3c110e4c4cc36fae78d05afa354831f5789e363b" ], "index": "pypi", - "version": "==1.24.2" + "version": "==1.24.6" }, "botocore": { "hashes": [ - "sha256:131f71fe16ef84f9e0e72c54d2e230a6d8e79dd3947f507259a129649649a35d", - "sha256:b7cdd4f4a6395a084a381a7d2a25b177e6de5f8a4dfa3c645ec957ba3c83e200" + "sha256:97c909a6ec5ad421573c18ae67fc6ea4232502cd30cffaf03bfcb584d9df652d", + "sha256:eeebe304161db6828413dc358ea80ece52f4ddbc8ecde4dd58978d5861a09293" ], - "markers": "python_version >= '3.7'", - "version": "==1.27.2" + "version": "==1.27.6" }, "brotli": { "hashes": [ @@ -144,7 +142,6 @@ "sha256:6a94c6402995a99c3970cc7e4884bb60b4a8639938157eeed436098bf9831757", "sha256:f9f17d2aec496a9aa6b76f53e3b614c965223c061982d434d160f930c698a9db" ], - "markers": "python_version ~= '3.7'", "version": "==5.2.0" }, "catalogue": { @@ -152,7 +149,6 @@ "sha256:535d33ae79ebd21ca298551d85da186ae8b8e1df36b0fb0246da774163ec2d6b", "sha256:cab4feda641fe05da1e6a1a9d123b0869d5ca324dcd93d4a5c384408ab62e7fb" ], - "markers": "python_version >= '3.6'", "version": "==2.0.7" }, "certifi": { @@ -160,7 +156,6 @@ "sha256:9c5705e395cd70084351dd8ad5c41e65655e08ce46f2ec9cf6c2c08390f71eb7", "sha256:f1d53542ee8cbedbe2118b5686372fb33c297fcd6379b050cca0ef13a597382a" ], - "markers": "python_version >= '3.6'", "version": "==2022.5.18.1" }, "charset-normalizer": { @@ -168,7 +163,6 @@ "sha256:2857e29ff0d34db842cd7ca3230549d1a697f96ee6d3fb071cfa6c7393832597", "sha256:6881edbebdb17b39b4eaaa821b438bf6eddffb4468cf344f09f89def34a8b1df" ], - "markers": "python_version >= '3'", "version": "==2.0.12" }, "click": { @@ -176,7 +170,6 @@ "sha256:6a7a62563bbfabfda3a38f3023a1db4a35978c0abd76f6c9605ecd6554d6d9b1", "sha256:8458d7b1287c5fb128c90e23381cf99dcde74beaf6c7ff6384ce84d6fe090adb" ], - "markers": "python_version >= '3.6'", "version": "==8.0.4" }, "cryptg": { @@ -261,24 +254,14 @@ "index": "pypi", "version": "==0.2.0" }, - "filelock": { - "hashes": [ - "sha256:37def7b658813cda163b56fc564cdc75e86d338246458c4c28ae84cabefa2404", - "sha256:3a0fd85166ad9dbab54c9aec96737b744106dc5f15c0b09a6744a445299fcf04" - ], - "markers": "python_version >= '3.7'", - "version": "==3.7.1" - }, "future": { "hashes": [ "sha256:b1bead90b70cf6ec3f0710ae53a525360fa360d306a86583adc6bf83a4db537d" ], - "markers": "python_version >= '2.6' and python_version not in '3.0, 3.1, 3.2'", "version": "==0.18.2" }, "gabber": { - "git": "https://github.com/stanfordio/gabber.git", - "ref": "a032db8047fa6b762b2fc127b08ee37d6ad9e110" + "git": "https://github.com/stanfordio/gabber.git" }, "gogettr": { "hashes": [ @@ -290,19 +273,17 @@ }, "google-auth": { "hashes": [ - "sha256:1ba4938e032b73deb51e59c4656a00e0939cf0b1112575099f136babb4563312", - "sha256:349ac49b18b01019453cc99c11c92ed772739778c92f184002b7ab3a5b7ac77d" + "sha256:8a954960f852d5f19e6af14dd8e75c20159609e85d8db37e4013cc8c3824a7e1", + "sha256:df549a1433108801b11bdcc0e312eaf0d5f0500db42f0523e4d65c78722e8475" ], - "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4, 3.5'", - "version": "==2.6.6" + "version": "==2.7.0" }, "google-auth-oauthlib": { "hashes": [ - "sha256:24f67735513c4c7134dbde2f1dee5a1deb6acc8dfcb577d7bff30d213a28e7b0", - "sha256:30596b824fc6808fdaca2f048e4998cc40fb4b3599eaea66d28dc7085b36c5b8" + "sha256:6d6161d0ec0a62e2abf2207c6071c117ec5897b300823c4bb2d963ee86e20e4f", + "sha256:d5e98a71203330699f92a26bc08847a92e8c3b1b8d82a021f1af34164db143ae" ], - "markers": "python_version >= '3.6'", - "version": "==0.5.1" + "version": "==0.5.2" }, "greenlet": { "hashes": [ @@ -377,22 +358,20 @@ "sha256:84d9dd047ffa80596e0f246e2eab0b391788b0503584e8945f2368256d2735ff", "sha256:9d643ff0a55b762d5cdb124b8eaa99c66322e2157b69160bc32796e824360e6d" ], - "markers": "python_version >= '3'", "version": "==3.3" }, "instaloader": { "hashes": [ - "sha256:7fa6147810eedcc1dedcdec8cfa1f220c9379ab8faeab6a336a7c181d944e2e4" + "sha256:77d4a140aafd1a9f48765db1f5ede9b74136eda67f428bfc392d7440b26ae74c" ], "index": "pypi", - "version": "==4.9" + "version": "==4.9.1" }, "jinja2": { "hashes": [ "sha256:31351a702a408a9e7595a8fc6150fc3f43bb6bf7e319770cbc0db9df9437e852", "sha256:6088930bfe239f0e6710546ab9c19c9ef35e29792895fed6e6e31a023a182a61" ], - "markers": "python_version >= '3.7'", "version": "==3.1.2" }, "jmespath": { @@ -400,7 +379,6 @@ "sha256:a490e280edd1f57d6de88636992d05b71e97d69a26a19f058ecf7d304474bf5e", "sha256:e8dcd576ed616f14ec02eed0005c85973b5890083313860136657e24784e4c04" ], - "markers": "python_version >= '3.7'", "version": "==1.0.0" }, "langcodes": { @@ -408,7 +386,6 @@ "sha256:4d89fc9acb6e9c8fdef70bcdf376113a3db09b67285d9e1d534de6d8818e7e69", "sha256:794d07d5a28781231ac335a1561b8442f8648ca07cd518310aeb45d6f0807ef6" ], - "markers": "python_version >= '3.6'", "version": "==3.3.0" }, "langdetect": { @@ -421,11 +398,11 @@ }, "loguru": { "hashes": [ - "sha256:b28e72ac7a98be3d28ad28570299a393dfcd32e5e3f6a353dec94675767b6319", - "sha256:f8087ac396b5ee5f67c963b495d615ebbceac2796379599820e324419d53667c" + "sha256:066bd06758d0a513e9836fd9c6b5a75bfb3fd36841f4b996bc60b547a309d41c", + "sha256:4e2414d534a2ab57573365b3e6d0234dfb1d84b68b7f3b948e6fb743860a77c3" ], "index": "pypi", - "version": "==0.5.3" + "version": "==0.6.0" }, "lxml": { "hashes": [ @@ -493,7 +470,7 @@ "sha256:f6d23a01921b741774f35e924d418a43cf03eca1444f3fdfd7978d35a5aaab8b", "sha256:fcdf70191f0d1761d190a436db06a46f05af60e1410e1507935f0332280c9268" ], - "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'", + "index": "pypi", "version": "==4.9.0" }, "markupsafe": { @@ -539,7 +516,6 @@ "sha256:f121a1420d4e173a5d96e47e9a0c0dcff965afdf1626d28de1460815f7c4ee7a", "sha256:fc7b548b17d238737688817ab67deebb30e8073c95749d55538ed473130ec0c7" ], - "markers": "python_version >= '3.7'", "version": "==2.1.1" }, "murmurhash": { @@ -571,7 +547,6 @@ "sha256:6397602efb3c2d7baebd2166ed85731ae1c1d475abca22090b7141ff5034b3e1", "sha256:9c9f243fcec7f410f138cb12c21c84c64fde4195481a30c9bfb05b5f003adfed" ], - "markers": "python_version >= '3.5' and python_version < '4'", "version": "==1.45.1" }, "numpy": { @@ -599,7 +574,6 @@ "sha256:f5a1c7c45ff29db501f9e38a360aedd833e355c14c75155ba2bd46ee3799e30a", "sha256:fde47931544086a648b12ee7c9ccf30edd6c6db776005fb07e4a019a04980042" ], - "markers": "python_version >= '3.8'", "version": "==1.23.0rc2" }, "oauthlib": { @@ -607,50 +581,29 @@ "sha256:23a8208d75b902797ea29fd31fa80a15ed9dc2c6c16fe73f5d346f83f6fa27a2", "sha256:6db33440354787f9b7f3a6dbd4febf5d0f93758354060e802f6c06cb493022fe" ], - "markers": "python_version >= '3.6'", "version": "==3.2.0" }, + "ocrd-pyexiftool": { + "hashes": [ + "sha256:13d7aeabd765256e7640e4198cc742538a9c458b34aca6644b356c6e908c922a", + "sha256:457a432d167174e93f63a487879ea767b0ff54aef539e40586fffe5fb9050461", + "sha256:9c77e753769857657069de76d2c4b592efbd99db3974a76df561fd0ca75cec0e" + ], + "index": "pypi", + "version": "==0.2.0" + }, "packaging": { "hashes": [ "sha256:dd47c42927d89ab911e606518907cc2d3a1f38bbd026385970643f9c5b8ecfeb", "sha256:ef103e05f519cdc783ae24ea4e2e0f508a9c99b2d4969652eed6a2e1ea5bd522" ], - "markers": "python_version >= '3.6'", "version": "==21.3" }, - "pandas": { - "hashes": [ - "sha256:0010771bd9223f7afe5f051eb47c4a49534345dfa144f2f5470b27189a4dd3b5", - "sha256:061609334a8182ab500a90fe66d46f6f387de62d3a9cb9aa7e62e3146c712167", - "sha256:09d8be7dd9e1c4c98224c4dfe8abd60d145d934e9fc1f5f411266308ae683e6a", - "sha256:295872bf1a09758aba199992c3ecde455f01caf32266d50abc1a073e828a7b9d", - "sha256:3228198333dd13c90b6434ddf61aa6d57deaca98cf7b654f4ad68a2db84f8cfe", - "sha256:385c52e85aaa8ea6a4c600a9b2821181a51f8be0aee3af6f2dcb41dafc4fc1d0", - "sha256:51649ef604a945f781105a6d2ecf88db7da0f4868ac5d45c51cb66081c4d9c73", - "sha256:5586cc95692564b441f4747c47c8a9746792e87b40a4680a2feb7794defb1ce3", - "sha256:5a206afa84ed20e07603f50d22b5f0db3fb556486d8c2462d8bc364831a4b417", - "sha256:5b79af3a69e5175c6fa7b4e046b21a646c8b74e92c6581a9d825687d92071b51", - "sha256:5c54ea4ef3823108cd4ec7fb27ccba4c3a775e0f83e39c5e17f5094cb17748bc", - "sha256:8c5bf555b6b0075294b73965adaafb39cf71c312e38c5935c93d78f41c19828a", - "sha256:92bc1fc585f1463ca827b45535957815b7deb218c549b7c18402c322c7549a12", - "sha256:95c1e422ced0199cf4a34385ff124b69412c4bc912011ce895582bee620dfcaa", - "sha256:b8134651258bce418cb79c71adeff0a44090c98d955f6953168ba16cc285d9f7", - "sha256:be67c782c4f1b1f24c2f16a157e12c2693fd510f8df18e3287c77f33d124ed07", - "sha256:c072c7f06b9242c855ed8021ff970c0e8f8b10b35e2640c657d2a541c5950f59", - "sha256:d0d4f13e4be7ce89d7057a786023c461dd9370040bdb5efa0a7fe76b556867a0", - "sha256:df82739e00bb6daf4bba4479a40f38c718b598a84654cbd8bb498fd6b0aa8c16", - "sha256:f549097993744ff8c41b5e8f2f0d3cbfaabe89b4ae32c8c08ead6cc535b80139", - "sha256:ff08a14ef21d94cdf18eef7c569d66f2e24e0bc89350bcd7d243dd804e3b5eb2" - ], - "markers": "python_version >= '3.8'", - "version": "==1.4.2" - }, "pathy": { "hashes": [ "sha256:25fd04cec6393661113086730ce69c789d121bea83ab1aa18452e8fd42faf29a", "sha256:838624441f799a06b446a657e4ecc9ebc3fdd05234397e044a7c87e8f6e76b1c" ], - "markers": "python_version >= '3.6'", "version": "==0.6.1" }, "pillow": { @@ -694,12 +647,10 @@ "sha256:f3f6a6034140e9e17e9abc175fc7a266a6e63652028e157750bd98e804a8ed9a", "sha256:ffde4c6fabb52891d81606411cbfaf77756e3b561b566efd270b3ed3791fde4e" ], - "markers": "python_version >= '3.7'", "version": "==9.1.1" }, "polyphemus": { - "git": "https://github.com/bellingcat/polyphemus", - "ref": "b18e5591fa4f903e5506742c2e3f17d45bb88755" + "git": "https://github.com/bellingcat/polyphemus" }, "preshed": { "hashes": [ @@ -814,7 +765,6 @@ "sha256:f75009715dcf4a3d680c2338ab19dac5498f8121173a929872950f4fb3a48fbf", "sha256:f8524b8bc89470cec7ac51734907818d3620fb1637f8f8b542d650ebec42a126" ], - "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'", "version": "==3.14.1" }, "pydantic": { @@ -842,29 +792,15 @@ "sha256:ea5cb40a3b23b3265f6325727ddfc45141b08ed665458be8c6285e7b85bd73a1", "sha256:fec866a0b59f372b7e776f2d7308511784dace622e0992a0b59ea3ccee0ae833" ], - "markers": "python_full_version >= '3.6.1'", "version": "==1.8.2" }, - "pyexiftool": { - "git": "https://github.com/smarnach/pyexiftool.git", - "ref": "3db3764895e687d75b42d3ae4e554ca8664a7f6f" - }, "pyparsing": { "hashes": [ "sha256:2b020ecf7d21b687f219b71ecad3631f644a47f01403fa1d1036b0c6416d70fb", "sha256:5026bae9a10eeaefb61dab2f09052b9f4307d44aee4eda64b309723d8d206bbc" ], - "markers": "python_full_version >= '3.6.8'", "version": "==3.0.9" }, - "pysocks": { - "hashes": [ - "sha256:08e69f092cc6dbe92a0fdd16eeb9b9ffbc13cadfe5ca4c7bd92ffb078b293299", - "sha256:2725bd0a9925919b9b51739eea5f9e2bae91e83288108a9ad338b2e3a4435ee5", - "sha256:3f8804571ebe159c380ac6de37643bb4685970655d3bba243530d6558b799aa0" - ], - "version": "==1.7.1" - }, "pytesseract": { "hashes": [ "sha256:7e2bafc7f48d1bb71443ce4633a56f5e21925a98f220a36c336297edcd1956d0", @@ -878,7 +814,6 @@ "sha256:0123cacc1627ae19ddf3c27a5de5bd67ee4586fbdd6440d9748f8abb483d3e86", "sha256:961d03dc3453ebbc59dbdea9e4e11c5651520a876d0f4db161e8674aae935da9" ], - "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2'", "version": "==2.8.2" }, "pytz": { @@ -894,7 +829,6 @@ "sha256:8314c9692a636c8eb3bda879b9f119e350e93223ae83e70e80c31675a0fdc1a6", "sha256:af097bae1b616dde5c5744441e2ddc69e74dfdcb0c263129610d85b87445a59d" ], - "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4, 3.5'", "version": "==0.1.0.post0" }, "ratelimit": { @@ -981,23 +915,21 @@ "sha256:fbc88d3ba402b5d041d204ec2449c4078898f89c4a6e6f0ed1c1a510ef1e221d", "sha256:fbd3fe37353c62fd0eb19fb76f78aa693716262bcd5f9c14bb9e5aca4b3f0dc4" ], - "markers": "python_version >= '3.6'", "version": "==2022.3.2" }, "requests": { "hashes": [ - "sha256:68d7c56fd5a8999887728ef304a6d12edc7be74f1cfa47714fc8b414525c9a61", - "sha256:f22fa1e554c9ddfd16e6e41ac79759e17be9e492b3587efa038054674760e72d" + "sha256:bc7861137fbce630f17b03d3ad02ad0bf978c844f3536d0edda6499dafce2b6f", + "sha256:d568723a7ebd25875d8d1eaf5dfa068cd2fc8194b2e483d7b1f7c81918dbec6b" ], "index": "pypi", - "version": "==2.27.1" + "version": "==2.28.0" }, "requests-oauthlib": { "hashes": [ "sha256:2577c501a2fb8d05a304c09d090d6e47c306fef15809d102b327cf8364bddab5", "sha256:75beac4a47881eeb94d5ea5d6ad31ef88856affe2332b9aafb52c6452ccf0d7a" ], - "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'", "version": "==1.3.1" }, "rsa": { @@ -1005,7 +937,6 @@ "sha256:5c6bd9dc7a543b7fe4304a631f8a8a3b674e2bbfc49c2ae96200cdbe55df6b17", "sha256:95c5d300c4e879ee69708c428ba566c59478fd653cc3a22243eeb8ed846950bb" ], - "markers": "python_version >= '3.6' and python_version < '4'", "version": "==4.8" }, "s3transfer": { @@ -1013,23 +944,13 @@ "sha256:06176b74f3a15f61f1b4f25a1fc29a4429040b7647133a463da8fa5bd28d5ecd", "sha256:2ed07d3866f523cc561bf4a00fc5535827981b117dd7876f036b0c1aca42c947" ], - "markers": "python_version >= '3.7'", "version": "==0.6.0" }, - "setuptools": { - "hashes": [ - "sha256:68e45d17c9281ba25dc0104eadd2647172b3472d9e01f911efa57965e8d51a36", - "sha256:a43bdedf853c670e5fed28e5623403bad2f73cf02f9a2774e91def6bda8265a7" - ], - "markers": "python_version >= '3.7'", - "version": "==62.3.2" - }, "six": { "hashes": [ "sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926", "sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254" ], - "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2'", "version": "==1.16.0" }, "smart-open": { @@ -1037,19 +958,16 @@ "sha256:71d14489da58b60ce12fc3ecb823facc59a8b23cd1b58edb97175640350d3a62", "sha256:75abf758717a92a8f53aa96953f0c245c8cedf8e1e4184903db3659b419d4c17" ], - "markers": "python_version >= '3.6' and python_version < '4'", "version": "==5.2.1" }, "snscrape": { - "git": "https://github.com/bellingcat/snscrape", - "ref": "0822a9c3548c4d0736a98f617d823b8475d24fda" + "git": "https://github.com/bellingcat/snscrape" }, "soupsieve": { "hashes": [ "sha256:3b2503d3c7084a42b1ebd08116e5f81aadfaea95863628c80a3b774a11b7c759", "sha256:fc53893b3da2c33de295667a0e19f078c14bf86544af307354de5fcf12a3f30d" ], - "markers": "python_version >= '3.6'", "version": "==2.3.2.post1" }, "spacy": { @@ -1079,7 +997,6 @@ "sha256:4f7dcbc4e6c8e8cb4eadbb009f9c0a1a2a67442e0032c8d6776c9470c3759903", "sha256:dfd58b0cc65b3596cb06f7b95e7bf4fff34668297c59eb179eb050db07b199df" ], - "markers": "python_version >= '3.6'", "version": "==3.0.9" }, "spacy-loggers": { @@ -1087,7 +1004,6 @@ "sha256:d48c9313a577ad1818da961cf6db71a73fd1e556ae47e6e68d7e28b541d11e18", "sha256:e75d44f4cf99e6763d7132ca7c8c420e0a92790222a08bc8eb9e24ea2c13536e" ], - "markers": "python_version >= '3.6'", "version": "==1.0.2" }, "sqlalchemy": { @@ -1101,6 +1017,7 @@ "sha256:3197441772dc3b1c6419f13304402f2418a18d7fe78000aa5a026e7100836739", "sha256:3688f92c62db6c5df268e2264891078f17ecb91e3141b400f2e28d0f75796dea", "sha256:3862a069a24f354145e01a76c7c720c263d62405fe5bed038c46a7ce900f5dd6", + "sha256:4a17c1a1152ca4c29d992714aa9df3054da3af1598e02134f2e7314a32ef69d8", "sha256:4c1d9fb3931e27d59166bb5c4dcc911400fee51082cfba66ceb19ac954ade068", "sha256:4e8706919829d455a9fa687c6bbd1b048e36fec3919a59f2d366247c2bfdbd9c", "sha256:50c8eaf44c3fed5ba6758d375de25f163e46137c39fda3a72b9ee1d1bb327dfc", @@ -1153,7 +1070,6 @@ "sha256:f96af9fde9f58d5923091fa723fa0fed58a83781b98e143a5d1fac5e738b9f0d", "sha256:fb08416fd6ef04c51fdeefd6d28592b64563b2853243c571a9b0d67403b5be7f" ], - "markers": "python_version >= '3.6'", "version": "==2.4.3" }, "telethon": { @@ -1186,7 +1102,6 @@ "sha256:eba973fe229e7fa86b99f2c5e2724f7f19040ac75a8ef7c8b23b434dac1eadea", "sha256:fd2d49a80a6c95be4eb0f8370a22eef903ecad10b65762d39c9b192abf905f7c" ], - "markers": "python_version >= '3.6'", "version": "==8.0.17" }, "tqdm": { @@ -1202,7 +1117,6 @@ "sha256:5646aef0d936b2c761a10393f0384ee6b5c7fe0bb3e5cd710b17134ca1d99cff", "sha256:e8467f0ebac0c81366c2168d6ad9f888efdfb6d4e1d3d5b4a004f46fa444b5c3" ], - "markers": "python_version >= '3.6'", "version": "==0.4.1" }, "typing-extensions": { @@ -1210,7 +1124,6 @@ "sha256:6657594ee297170d19f67d55c05852a874e7eb634f4f753dbd667855e07c1708", "sha256:f1c24655a0da0d1b67f07e17a5e6b2a105894e6824b92096378bb3668ef02376" ], - "markers": "python_version >= '3.7'", "version": "==4.2.0" }, "tzdata": { @@ -1226,7 +1139,6 @@ "sha256:89885494684c929d9191c57aa27502afc87a579be5cdd3225c77c463ea043745", "sha256:ee5842fa3a795f023514ac2d801c4a81d1743bbe642e3940143326b3a00addd7" ], - "markers": "python_version >= '3.6'", "version": "==4.2" }, "urllib3": { @@ -1234,7 +1146,6 @@ "sha256:44ece4d53fb1706f667c9bd1c648f5469a2ec925fcf3a776667042d645472c14", "sha256:aabaf16477806a5e1dd19aa41f8c2b7950dd3c746362d7e3223dbe6de6ac448e" ], - "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' and python_version < '4'", "version": "==1.26.9" }, "wasabi": { @@ -1295,7 +1206,6 @@ "sha256:fab7c640815812ed5f10fbee7abbf58788d602046b7bb3af9b1ac753a6d5e916", "sha256:fc06cc8073c8e87072138ba1e431300e2d408f054b27047d047b549455066ff4" ], - "markers": "python_version >= '3.7'", "version": "==10.3" }, "yt-dlp": { @@ -1320,7 +1230,6 @@ "sha256:2d27e3784d7a565d36ab851fe94887c5eccd6a463168875832a1be79c82828b4", "sha256:626ba8234211db98e869df76230a137c4c40a12d72445c45d5f5b716f076e2fd" ], - "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'", "version": "==21.4.0" }, "babel": { @@ -1328,7 +1237,6 @@ "sha256:3f349e85ad3154559ac4930c3918247d319f21910d5ce4b25d439ed8693b98d2", "sha256:98aeaca086133efb3e1e2aad0396987490c8425929ddbcfe0550184fdc54cd13" ], - "markers": "python_version >= '3.6'", "version": "==2.10.1" }, "black": { @@ -1365,7 +1273,6 @@ "sha256:9c5705e395cd70084351dd8ad5c41e65655e08ce46f2ec9cf6c2c08390f71eb7", "sha256:f1d53542ee8cbedbe2118b5686372fb33c297fcd6379b050cca0ef13a597382a" ], - "markers": "python_version >= '3.6'", "version": "==2022.5.18.1" }, "charset-normalizer": { @@ -1373,7 +1280,6 @@ "sha256:2857e29ff0d34db842cd7ca3230549d1a697f96ee6d3fb071cfa6c7393832597", "sha256:6881edbebdb17b39b4eaaa821b438bf6eddffb4468cf344f09f89def34a8b1df" ], - "markers": "python_version >= '3'", "version": "==2.0.12" }, "click": { @@ -1381,13 +1287,9 @@ "sha256:6a7a62563bbfabfda3a38f3023a1db4a35978c0abd76f6c9605ecd6554d6d9b1", "sha256:8458d7b1287c5fb128c90e23381cf99dcde74beaf6c7ff6384ce84d6fe090adb" ], - "markers": "python_version >= '3.6'", "version": "==8.0.4" }, "coverage": { - "extras": [ - "toml" - ], "hashes": [ "sha256:01c5615d13f3dd3aa8543afc069e5319cfa0c7d712f6e04b920431e5c564a749", "sha256:106c16dfe494de3193ec55cac9640dd039b66e196e4641fa8ac396181578b982", @@ -1431,7 +1333,6 @@ "sha256:fdb6f7bd51c2d1714cea40718f6149ad9be6a2ee7d93b19e9f00934c0f2a74d9", "sha256:ffa9297c3a453fba4717d06df579af42ab9a28022444cae7fa605af4df612d54" ], - "markers": "python_version >= '3.7'", "version": "==6.4.1" }, "docutils": { @@ -1439,7 +1340,6 @@ "sha256:686577d2e4c32380bb50cbb22f575ed742d58168cee37e99117a854bcd88f125", "sha256:cf316c8370a737a022b72b56874f6602acf974a37a9fba42ec2876387549fc61" ], - "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'", "version": "==0.17.1" }, "idna": { @@ -1447,7 +1347,6 @@ "sha256:84d9dd047ffa80596e0f246e2eab0b391788b0503584e8945f2368256d2735ff", "sha256:9d643ff0a55b762d5cdb124b8eaa99c66322e2157b69160bc32796e824360e6d" ], - "markers": "python_version >= '3'", "version": "==3.3" }, "imagesize": { @@ -1455,7 +1354,6 @@ "sha256:1db2f82529e53c3e929e8926a1fa9235aa82d0bd0c580359c67ec31b2fddaa8c", "sha256:cd1750d452385ca327479d45b64d9c7729ecf0b3969a58148298c77092261f9d" ], - "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'", "version": "==1.3.0" }, "importlib-metadata": { @@ -1478,7 +1376,6 @@ "sha256:31351a702a408a9e7595a8fc6150fc3f43bb6bf7e319770cbc0db9df9437e852", "sha256:6088930bfe239f0e6710546ab9c19c9ef35e29792895fed6e6e31a023a182a61" ], - "markers": "python_version >= '3.7'", "version": "==3.1.2" }, "markupsafe": { @@ -1524,7 +1421,6 @@ "sha256:f121a1420d4e173a5d96e47e9a0c0dcff965afdf1626d28de1460815f7c4ee7a", "sha256:fc7b548b17d238737688817ab67deebb30e8073c95749d55538ed473130ec0c7" ], - "markers": "python_version >= '3.7'", "version": "==2.1.1" }, "mypy-extensions": { @@ -1539,7 +1435,6 @@ "sha256:dd47c42927d89ab911e606518907cc2d3a1f38bbd026385970643f9c5b8ecfeb", "sha256:ef103e05f519cdc783ae24ea4e2e0f508a9c99b2d4969652eed6a2e1ea5bd522" ], - "markers": "python_version >= '3.6'", "version": "==21.3" }, "pathspec": { @@ -1554,7 +1449,6 @@ "sha256:027d8e83a2d7de06bbac4e5ef7e023c02b863d7ea5d079477e722bb41ab25788", "sha256:58c8abb07dcb441e6ee4b11d8df0ac856038f944ab98b7be6b27b2a3c7feef19" ], - "markers": "python_version >= '3.7'", "version": "==2.5.2" }, "pluggy": { @@ -1562,7 +1456,6 @@ "sha256:4224373bacce55f955a878bf9cfa763c1e360858e330072059e10bad68531159", "sha256:74134bbf457f031a36d68416e1509f34bd5ccc019f0bcc952c7b909d06b37bd3" ], - "markers": "python_version >= '3.6'", "version": "==1.0.0" }, "py": { @@ -1570,7 +1463,6 @@ "sha256:51c75c4126074b472f746a24399ad32f6053d1b34b68d2fa41e558e6f4a98719", "sha256:607c53218732647dff4acdfcd50cb62615cedf612e72d1724fb1a0cc6405b378" ], - "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'", "version": "==1.11.0" }, "pygments": { @@ -1578,7 +1470,6 @@ "sha256:5eb116118f9612ff1ee89ac96437bb6b49e8f04d8a13b514ba26f620208e26eb", "sha256:dc9c10fb40944260f6ed4c688ece0cd2048414940f1cea51b8b226318411c519" ], - "markers": "python_version >= '3.6'", "version": "==2.12.0" }, "pyparsing": { @@ -1586,7 +1477,6 @@ "sha256:2b020ecf7d21b687f219b71ecad3631f644a47f01403fa1d1036b0c6416d70fb", "sha256:5026bae9a10eeaefb61dab2f09052b9f4307d44aee4eda64b309723d8d206bbc" ], - "markers": "python_full_version >= '3.6.8'", "version": "==3.0.9" }, "pytest": { @@ -1631,11 +1521,11 @@ }, "requests": { "hashes": [ - "sha256:68d7c56fd5a8999887728ef304a6d12edc7be74f1cfa47714fc8b414525c9a61", - "sha256:f22fa1e554c9ddfd16e6e41ac79759e17be9e492b3587efa038054674760e72d" + "sha256:bc7861137fbce630f17b03d3ad02ad0bf978c844f3536d0edda6499dafce2b6f", + "sha256:d568723a7ebd25875d8d1eaf5dfa068cd2fc8194b2e483d7b1f7c81918dbec6b" ], "index": "pypi", - "version": "==2.27.1" + "version": "==2.28.0" }, "snowballstemmer": { "hashes": [ @@ -1665,7 +1555,6 @@ "sha256:806111e5e962be97c29ec4c1e7fe277bfd19e9652fb1a4392105b43e01af885a", "sha256:a072735ec80e7675e3f432fcae8610ecf509c5f1869d17e2eecff44389cdbc58" ], - "markers": "python_version >= '3.5'", "version": "==1.0.2" }, "sphinxcontrib-devhelp": { @@ -1673,7 +1562,6 @@ "sha256:8165223f9a335cc1af7ffe1ed31d2871f325254c0423bc0c4c7cd1c1e4734a2e", "sha256:ff7f1afa7b9642e7060379360a67e9c41e8f3121f2ce9164266f61b9f4b338e4" ], - "markers": "python_version >= '3.5'", "version": "==1.0.2" }, "sphinxcontrib-htmlhelp": { @@ -1681,7 +1569,6 @@ "sha256:d412243dfb797ae3ec2b59eca0e52dac12e75a241bf0e4eb861e450d06c6ed07", "sha256:f5f8bb2d0d629f398bf47d0d69c07bc13b65f75a81ad9e2f71a63d4b7a2f6db2" ], - "markers": "python_version >= '3.6'", "version": "==2.0.0" }, "sphinxcontrib-jsmath": { @@ -1689,7 +1576,6 @@ "sha256:2ec2eaebfb78f3f2078e73666b1415417a116cc848b72e5172e596c871103178", "sha256:a9925e4a4587247ed2191a22df5f6970656cb8ca2bd6284309578f2153e0c4b8" ], - "markers": "python_version >= '3.5'", "version": "==1.0.1" }, "sphinxcontrib-qthelp": { @@ -1697,7 +1583,6 @@ "sha256:4c33767ee058b70dba89a6fc5c1892c0d57a54be67ddd3e7875a18d14cba5a72", "sha256:bd9fc24bcb748a8d51fd4ecaade681350aa63009a347a8c14e637895444dfab6" ], - "markers": "python_version >= '3.5'", "version": "==1.0.3" }, "sphinxcontrib-serializinghtml": { @@ -1705,7 +1590,6 @@ "sha256:352a9a00ae864471d3a7ead8d7d79f5fc0b57e8b3f95e9867eb9eb28999b92fd", "sha256:aa5f6de5dfdf809ef505c4895e51ef5c9eac17d0f287933eb49ec495280b6952" ], - "markers": "python_version >= '3.5'", "version": "==1.1.5" }, "tomli": { @@ -1713,7 +1597,6 @@ "sha256:939de3e7a6161af0c887ef91b7d41a53e7c5a1ca976325f429cb46ea9bc30ecc", "sha256:de526c12914f0c550d15924c62d72abc48d6fe7364aa87328337a31007fe8a4f" ], - "markers": "python_version >= '3.7'", "version": "==2.0.1" }, "typing-extensions": { @@ -1721,7 +1604,6 @@ "sha256:6657594ee297170d19f67d55c05852a874e7eb634f4f753dbd667855e07c1708", "sha256:f1c24655a0da0d1b67f07e17a5e6b2a105894e6824b92096378bb3668ef02376" ], - "markers": "python_version >= '3.7'", "version": "==4.2.0" }, "urllib3": { @@ -1729,7 +1611,6 @@ "sha256:44ece4d53fb1706f667c9bd1c648f5469a2ec925fcf3a776667042d645472c14", "sha256:aabaf16477806a5e1dd19aa41f8c2b7950dd3c746362d7e3223dbe6de6ac448e" ], - "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' and python_version < '4'", "version": "==1.26.9" }, "zipp": { @@ -1737,7 +1618,6 @@ "sha256:56bf8aadb83c24db6c4b577e13de374ccfb67da2078beba1d037c17980bf43ad", "sha256:c4f6e5bbf48e74f7a38e7cc5b0480ff42b0ae5178957d564d18932525d5cf099" ], - "markers": "python_version >= '3.7'", "version": "==3.8.0" } } diff --git a/cisticola/base.py b/cisticola/base.py index f4a23f8..3c6651a 100644 --- a/cisticola/base.py +++ b/cisticola/base.py @@ -237,12 +237,12 @@ class Post: # replace is here in order to prevent catastrophic backtracking urls = re.findall(URL_REGEX, self.content.replace("::::::::", "")) - self.outlinks = urls + self.outlinks += urls HASHTAG_REGEX = r"(?:^|\s)[##]{1}(\w+)" hashtags = re.findall(HASHTAG_REGEX, self.content) - self.hashtags = hashtags + self.hashtags += hashtags # regex patterns for finding crypto addresses BTC_REGEX = r'\b(bc(0([ac-hj-np-z02-9]{39}|[ac-hj-np-z02-9]{59})|1[ac-hj-np-z02-9]{8,87})|[13][a-km-zA-HJ-NP-Z1-9]{25,35})\b' diff --git a/cisticola/scraper/base.py b/cisticola/scraper/base.py index 973fc55..5dfa1f5 100644 --- a/cisticola/scraper/base.py +++ b/cisticola/scraper/base.py @@ -421,6 +421,9 @@ class ScraperController: session.commit() added += 1 + profile = scraper.get_profile(channel) + session.add(profile) + session.commit() logger.info( f"{scraper} found {added} new posts from {channel}") diff --git a/cisticola/scraper/bitchute.py b/cisticola/scraper/bitchute.py index c0cedc9..9942750 100644 --- a/cisticola/scraper/bitchute.py +++ b/cisticola/scraper/bitchute.py @@ -5,6 +5,7 @@ from html.parser import HTMLParser import dateparser import json from typing import Generator +from dateutil.relativedelta import relativedelta import requests from bs4 import BeautifulSoup @@ -70,7 +71,7 @@ class BitchuteScraper(Scraper): if channel.platform == "Bitchute" and self.get_username_from_url(channel.url) is not None: return True - @logger.catch + @logger.catch(reraise = True) def get_profile(self, channel: Channel) -> RawChannelInfo: base_url = channel.url @@ -104,7 +105,7 @@ class BitchuteScraper(Scraper): profile = { 'description' : description_soup.text.strip(), 'description_links' : [a['href'] for a in description_soup.find_all('a', href = True)], - 'created': re.sub(r'\s', ' ', info_list[0].text.split('Created')[1].strip('. ')), + 'created': parse_created(re.sub(r'\s', ' ', info_list[0].text.split('Created')[1].strip('. '))), 'videos' : int(info_list[1].text.split('videos')[0].strip()), 'owner_url' : soup.find('p', {'class' : 'owner'}).find('a', href = True)['href'], 'owner_name' : owner_name, @@ -116,7 +117,7 @@ class BitchuteScraper(Scraper): return RawChannelInfo(scraper=self.__version__, platform=channel.platform, channel=channel.id, - raw_data=json.dumps(profile), + raw_data=json.dumps(profile, default = str), date_archived=datetime.now(timezone.utc)) #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++# @@ -484,4 +485,14 @@ def decode_cfemail(cfemail): return email -#---------------------------------------------------------------------------# \ No newline at end of file +#---------------------------------------------------------------------------# + +def parse_created(created): + + period_list = ['year', 'month', 'week', 'day'] + + periods = [period.strip() for period in created.split('ago')[0].strip().split(',')] + _kwargs = {period : int(number) for period, number in dict(reversed(p.split(' ')) for p in periods).items()} + kwargs = {(k + 's' if k in period_list else k) : v for k, v in _kwargs.items()} + + return datetime.now() - relativedelta(**kwargs) \ No newline at end of file diff --git a/cisticola/scraper/rumble.py b/cisticola/scraper/rumble.py index 0c7177f..fe7adb3 100644 --- a/cisticola/scraper/rumble.py +++ b/cisticola/scraper/rumble.py @@ -14,7 +14,7 @@ BASE_URL = 'https://rumble.com' class RumbleScraper(Scraper): """An implementation of a Scraper for Rumble, using custom functions""" - __version__ = "RumbleScraper 0.0.1" + __version__ = "RumbleScraper 0.0.2" cookiestring = os.environ["YOUTUBE_COOKIESTRING"].replace(r'\n', '\n').replace(r'\t', '\t') cookiefilename = 'cookiefile.txt' @@ -105,20 +105,38 @@ def process_video(video): views = None else: views = view_span.get('data-value') - + + author_a = video.find('a', {'rel': 'author'}) + if author_a is None: + author_id = None + author_name = None + else: + author_id = author_a['href'].split('/')[-1] + author_name = author_a.text + + video_link = BASE_URL + video.find('a', href = True)['href'] + r = make_request(url = video_link) + soup = BeautifulSoup(r.content, features = 'html.parser') + + content_div = soup.find('div', {'class': 'container content media-description'}) + info = { 'title' : video.find('h3').text, 'thumbnail' : video.find('img')['src'], - 'link' : BASE_URL + video.find('a', href = True)['href'], + 'link' : video_link, 'views' : views, 'rumbles' : rumbles, + 'content': '' if content_div is None else content_div.get_text('\n'), 'duration' : video.find('span', {'class' : 'video-item--duration'})['data-value'], - 'datetime' : datetime.fromisoformat(video.find('time')['datetime'])} + 'datetime' : datetime.fromisoformat(video.find('time')['datetime']), + 'author_id': author_id, + 'author_name': author_name} info['media_url'] = get_media_url(info['link']) return info + def get_channel_videos(url): page = 1 @@ -150,8 +168,15 @@ def get_channel_profile(url): thumbnail_soup = soup.find('img', {'class' : 'listing-header--thumb'}) cover_soup = soup.find('img', {'class' : 'listing-header--backsplash-img'}) + author_a = soup.find('a', {'rel': 'author'}) + if author_a is None: + author_id = None + else: + author_id = author_a['href'].split('/')[-1] + profile = { 'name': soup.find('h1').text, + 'id': author_id, 'verified': verified_svg is not None, 'thumbnail': thumbnail_soup.get('src') if thumbnail_soup else None, 'cover': cover_soup.get('src') if cover_soup else None, diff --git a/cisticola/transformer/__init__.py b/cisticola/transformer/__init__.py index b4e0968..37df764 100644 --- a/cisticola/transformer/__init__.py +++ b/cisticola/transformer/__init__.py @@ -1,4 +1,6 @@ from .base import ETLController from .twitter import TwitterTransformer from .bitchute import BitchuteTransformer -from .telegram_telethon import TelegramTelethonTransformer \ No newline at end of file +from .telegram_telethon import TelegramTelethonTransformer +from .rumble import RumbleTransformer +from .gettr import GettrTransformer diff --git a/cisticola/transformer/bitchute.py b/cisticola/transformer/bitchute.py index d0c5fe0..19fac56 100644 --- a/cisticola/transformer/bitchute.py +++ b/cisticola/transformer/bitchute.py @@ -1,11 +1,13 @@ import json from loguru import logger -from typing import Generator +from typing import Generator, Union, Callable +from datetime import datetime, timezone +import dateutil.parser from bs4 import BeautifulSoup from cisticola.transformer.base import Transformer -from cisticola.base import ScraperResult, Post, Image, Video, Media +from cisticola.base import RawChannelInfo, ScraperResult, Post, Image, Video, Media, Channel, ChannelInfo class BitchuteTransformer(Transformer): """A Bitchute specific ScraperResult, with a method ETL/transforming""" @@ -19,7 +21,7 @@ class BitchuteTransformer(Transformer): return False - def transform_media(self, data: ScraperResult, transformed: Post) -> Generator[Media, None, None]: + def transform_media(self, data: ScraperResult, insert: Callable, transformed: Post) -> Generator[Media, None, None]: raw = json.loads(data.raw_data) orig = raw['video_url'] @@ -27,9 +29,34 @@ class BitchuteTransformer(Transformer): m = Video(url=new, post=transformed.id, raw_id=data.id, original_url=orig) - yield m + insert(m) - def transform(self, data: ScraperResult) -> Post: + def transform_info(self, data: RawChannelInfo, insert: Callable, session) -> Generator[Union[Post, Channel, Media], None, None]: + raw = json.loads(data.raw_data) + + transformed = ChannelInfo( + raw_channel_info_id=data.id, + channel=data.channel, + platform_id=raw['owner_url'].strip('/').split('/')[-1], + platform=data.platform, + scraper=data.scraper, + transformer=self.__version__, + screenname=raw['owner_name'], + name=raw['owner_name'], + description=raw['description'], + description_url='', # does not exist for Bitchute + description_location='', # does not exist for Bitchute + followers=raw['subscribers'], + following=-1, # does not exist for Bitchute + verified=False, # does not exist for Bitchute + date_created=dateutil.parser.parse(raw['created']), + date_archived=data.date_archived, + date_transformed=datetime.now(timezone.utc) + ) + + transformed = insert(transformed) + + def transform(self, data: ScraperResult, insert: Callable, session) -> Generator[Union[Post, Channel, Media], None, None]: raw = json.loads(data.raw_data) soup = BeautifulSoup(raw['body'], features = 'html.parser') @@ -37,15 +64,17 @@ class BitchuteTransformer(Transformer): transformed = Post( raw_id=data.id, + platform_id=raw['id'], scraper=data.scraper, transformer=self.__version__, platform=data.platform, channel=data.channel, date=data.date, date_archived=data.date_archived, + date_transformed=datetime.now(timezone.utc), url=raw['url'], content=content, author_id=raw['author_id'], author_username=raw['author']) - return transformed + transformed = insert(transformed) \ No newline at end of file diff --git a/cisticola/transformer/gettr.py b/cisticola/transformer/gettr.py new file mode 100644 index 0000000..aff1264 --- /dev/null +++ b/cisticola/transformer/gettr.py @@ -0,0 +1,78 @@ +import json +from loguru import logger +from typing import Generator, Union, Callable +import dateutil.parser +from datetime import datetime, timezone + +from cisticola.transformer.base import Transformer +from cisticola.base import RawChannelInfo, ChannelInfo, ScraperResult, Post, Image, Video, Media, Channel + +class GettrTransformer(Transformer): + """A Gettr specific ScraperResult, with a method ETL/transforming""" + + __version__ = "GettrTransformer 0.0.1" + + def can_handle(self, data: ScraperResult) -> bool: + scraper = data.scraper.split(' ') + if scraper[0] == "GettrScraper": + return True + + return False + + def transform_info(self, data: RawChannelInfo, insert: Callable, session) -> Generator[Union[Post, Channel, Media], None, None]: + raw = json.loads(data.raw_data) + + transformed = ChannelInfo( + raw_channel_info_id=data.id, + channel=data.channel, + platform_id=raw['_id'], + platform=data.platform, + scraper=data.scraper, + transformer=self.__version__, + screenname=raw['username'], + name=raw['nickname'], + description=raw['dsc'], + description_url=raw['website'], + description_location=raw['location'], + followers=raw['flg'], + following=raw['flw'], + verified=True if raw.get('infl') else False, + date_created=datetime.fromtimestamp(raw['cdate']*0.001), + date_archived=data.date_archived, + date_transformed=datetime.now(timezone.utc) + ) + + transformed = insert(transformed) + + + def transform(self, data: ScraperResult, insert: Callable, session) -> Generator[Union[Post, Channel, Media], None, None]: + raw = json.loads(data.raw_data) + + if raw["activity"]["action"] == "shares_pst": + forwarded_from = raw["activity"]["uid"] + else: + forwarded_from = None + + transformed = Post( + raw_id=data.id, + platform_id=raw["_id"], + scraper=data.scraper, + transformer=self.__version__, + platform=data.platform, + channel=data.channel, + date=datetime.fromtimestamp(raw["activity"]["cdate"] / 1000.0), + date_archived=data.date_archived, + date_transformed=datetime.now(timezone.utc), + url="https://www.gettr.com/post/" + raw["_id"], + content=raw.get("txt", ""), + author_id=raw["receiver_id"], + author_username=raw["uid"], + hashtags=raw.get("htgs", []), + outlinks = list(filter(None, [raw.get("prevsrc")])), + forwarded_from = forwarded_from) + + insert(transformed) + + # media = self.process_media(raw, transformed.id, data) + # for m in media: + # insert(m) \ No newline at end of file diff --git a/cisticola/transformer/rumble.py b/cisticola/transformer/rumble.py new file mode 100644 index 0000000..91ef244 --- /dev/null +++ b/cisticola/transformer/rumble.py @@ -0,0 +1,70 @@ +import json +from loguru import logger +from typing import Generator, Union, Callable +import dateutil.parser +from datetime import datetime, timezone + +from cisticola.transformer.base import Transformer +from cisticola.base import RawChannelInfo, ChannelInfo, ScraperResult, Post, Image, Video, Media, Channel + +class RumbleTransformer(Transformer): + """A Rumble specific ScraperResult, with a method ETL/transforming""" + + __version__ = "RumbleTransformer 0.0.1" + + def can_handle(self, data: ScraperResult) -> bool: + scraper = data.scraper.split(' ') + if scraper[0] == "RumbleScraper": + return True + + return False + + def transform_info(self, data: RawChannelInfo, insert: Callable, session) -> Generator[Union[Post, Channel, Media], None, None]: + raw = json.loads(data.raw_data) + + transformed = ChannelInfo( + raw_channel_info_id=data.id, + channel=data.channel, + platform_id=raw['id'], + platform=data.platform, + scraper=data.scraper, + transformer=self.__version__, + screenname=raw['id'], + name=raw['name'], + description='', # does not exist for Rumble + description_url='', # does not exist for Rumble + description_location='', # does not exist for Rumble + followers=raw['subscribers'], + following=-1, # does not exist for Rumble + verified=raw['verified'], + date_created=None, # does not exist for Rumble + date_archived=data.date_archived, + date_transformed=datetime.now(timezone.utc) + ) + + transformed = insert(transformed) + + + def transform(self, data: ScraperResult, insert: Callable, session) -> Generator[Union[Post, Channel, Media], None, None]: + raw = json.loads(data.raw_data) + + transformed = Post( + raw_id=data.id, + platform_id=raw['media_url'].strip('/').split('/')[-1], + scraper=data.scraper, + transformer=self.__version__, + platform=data.platform, + channel=data.channel, + date=dateutil.parser.parse(raw['datetime']), + date_archived=data.date_archived, + date_transformed=datetime.now(timezone.utc), + url=raw['link'], + content=raw['content'], + author_id=raw['author_id'], + author_username=raw['author_name']) + + insert(transformed) + + # media = self.process_media(raw, transformed.id, data) + # for m in media: + # insert(m) \ No newline at end of file diff --git a/cisticola/transformer/telegram_telethon.py b/cisticola/transformer/telegram_telethon.py index f233516..d0a8cee 100644 --- a/cisticola/transformer/telegram_telethon.py +++ b/cisticola/transformer/telegram_telethon.py @@ -61,7 +61,7 @@ class TelegramTelethonTransformer(Transformer): self.bad_channels[orig_screenname] = True return "" - soup = BeautifulSoup(r.content) + soup = BeautifulSoup(r.content, features = 'lxml') post = soup.findAll("div", {"data-post" : orig_screenname + "/" + str(id)}) name = "" @@ -181,14 +181,14 @@ class TelegramTelethonTransformer(Transformer): transformed = insert(transformed) - # for k in data.archived_urls: - # if data.archived_urls[k]: - # archived_url = data.archived_urls[k] - # ext = archived_url.split('.')[-1] + for k in data.archived_urls: + if data.archived_urls[k]: + archived_url = data.archived_urls[k] + ext = archived_url.split('.')[-1] - # if ext == 'mp4' or ext == 'mov' or ext == 'avi' or ext =='mkv': - # insert(Video(url=archived_url, post=transformed.id, raw_id=data.id, original_url=k)) - # else: - # insert(Image(url=archived_url, post=transformed.id, raw_id=data.id, original_url=k)) + if ext == 'mp4' or ext == 'mov' or ext == 'avi' or ext =='mkv': + insert(Video(url=archived_url, post=transformed.id, raw_id=data.id, original_url=k)) + else: + insert(Image(url=archived_url, post=transformed.id, raw_id=data.id, original_url=k)) \ No newline at end of file diff --git a/cisticola/transformer/twitter.py b/cisticola/transformer/twitter.py index 85ada05..f5e5a01 100644 --- a/cisticola/transformer/twitter.py +++ b/cisticola/transformer/twitter.py @@ -2,9 +2,10 @@ import json from loguru import logger from typing import Generator, Union, Callable import dateutil.parser +from datetime import datetime, timezone from cisticola.transformer.base import Transformer -from cisticola.base import ScraperResult, Post, Image, Video, Media, Channel +from cisticola.base import RawChannelInfo, ChannelInfo, ScraperResult, Post, Image, Video, Media, Channel class TwitterTransformer(Transformer): """A Twitter specific ScraperResult, with a method ETL/transforming""" @@ -45,8 +46,33 @@ class TwitterTransformer(Transformer): yield m + def transform_info(self, data: RawChannelInfo, insert: Callable, session) -> Generator[Union[Post, Channel, Media], None, None]: + raw = json.loads(data.raw_data) - def transform(self, data: ScraperResult, insert: Callable) -> Generator[Union[Post, Channel, Media], None, None]: + transformed = ChannelInfo( + raw_channel_info_id=data.id, + channel=data.channel, + platform_id=raw['id'], + platform=data.platform, + scraper=data.scraper, + transformer=self.__version__, + screenname=raw['username'], + name=raw['displayname'], + description=raw['rawDescription'], + description_url=raw['linkUrl'], + description_location=raw['location'], + followers=raw['followersCount'], + following=raw['friendsCount'], + verified=raw['verified'], + date_created=dateutil.parser.parse(raw['created']), + date_archived=data.date_archived, + date_transformed=datetime.now(timezone.utc) + ) + + transformed = insert(transformed) + + + def transform(self, data: ScraperResult, insert: Callable, session) -> Generator[Union[Post, Channel, Media], None, None]: raw = json.loads(data.raw_data) transformed = Post( @@ -58,6 +84,7 @@ class TwitterTransformer(Transformer): channel=data.channel, date=dateutil.parser.parse(raw['date']), date_archived=data.date_archived, + date_transformed=datetime.now(timezone.utc), url=raw['url'], content=raw['content'], author_id=raw['user']['id'], @@ -85,6 +112,7 @@ class TwitterTransformer(Transformer): channel=channel.id, date=dateutil.parser.parse(tweet['date']), date_archived=data.date_archived, + date_transformed=datetime.now(timezone.utc), url=tweet['url'], content=tweet['content'], author_id=tweet['user']['id'], @@ -109,7 +137,4 @@ class TwitterTransformer(Transformer): media = self.process_media(raw, transformed.id, data) for m in media: - insert(m) - - - + insert(m) \ No newline at end of file diff --git a/tests/transformer/bitchute.py b/tests/transformer/bitchute.py new file mode 100644 index 0000000..161d3e5 --- /dev/null +++ b/tests/transformer/bitchute.py @@ -0,0 +1,34 @@ +from sqlalchemy.orm import sessionmaker +import json + +import pytest + +from cisticola.base import Channel +from cisticola.scraper import BitchuteScraper +from cisticola.transformer import BitchuteTransformer +from cisticola.base import Post, Media + +@pytest.mark.media +def test_scrape_etl_bitchute(engine, controller, etl_controller, channel_kwargs): + controller.reset_db() + + channels = [Channel(**channel_kwargs['bitchute'])] + controller.register_scraper(scraper = BitchuteScraper()) + controller.scrape_channels(channels = channels, archive_media = True) + + etl_controller.register_transformer(BitchuteTransformer()) + etl_controller.transform_all_untransformed() + etl_controller.transform_all_untransformed_info() + + sessionfactory = sessionmaker() + sessionfactory.configure(bind=engine) + session = sessionfactory() + + posts = session.query(Post).all() + media = session.query(Media).all() + + assert len(posts) == 5 + # assert len(media) == 0 + + assert 'Pendant are something that the advanced ladies can fuse in her every day look' in posts[0].content + # assert json.loads(media[0].exif)['Composite:ImageSize'] == "826 728" \ No newline at end of file diff --git a/tests/transformer/gettr.py b/tests/transformer/gettr.py new file mode 100644 index 0000000..ef37b67 --- /dev/null +++ b/tests/transformer/gettr.py @@ -0,0 +1,34 @@ +from sqlalchemy.orm import sessionmaker +import json + +import pytest + +from cisticola.base import Channel +from cisticola.scraper import GettrScraper +from cisticola.transformer import GettrTransformer +from cisticola.base import Post, Media + +@pytest.mark.media +def test_scrape_etl_gettr(engine, controller, etl_controller, channel_kwargs): + controller.reset_db() + + channels = [Channel(**channel_kwargs['gettr'])] + controller.register_scraper(scraper = GettrScraper()) + controller.scrape_channels(channels = channels, archive_media = True) + + etl_controller.register_transformer(GettrTransformer()) + etl_controller.transform_all_untransformed() + etl_controller.transform_all_untransformed_info() + + sessionfactory = sessionmaker() + sessionfactory.configure(bind=engine) + session = sessionfactory() + + posts = session.query(Post).all() + media = session.query(Media).all() + + assert len(posts) == 23 + # assert len(media) == 0 + + assert 'Nigerian gender studies' in posts[-1].content + # assert json.loads(media[0].exif)['Composite:ImageSize'] == "826 728" \ No newline at end of file diff --git a/tests/transformer/rumble.py b/tests/transformer/rumble.py new file mode 100644 index 0000000..95450ed --- /dev/null +++ b/tests/transformer/rumble.py @@ -0,0 +1,34 @@ +from sqlalchemy.orm import sessionmaker +import json + +import pytest + +from cisticola.base import Channel +from cisticola.scraper import RumbleScraper +from cisticola.transformer import RumbleTransformer +from cisticola.base import Post, Media + +@pytest.mark.media +def test_scrape_etl_rumble(engine, controller, etl_controller, channel_kwargs): + controller.reset_db() + + channels = [Channel(**channel_kwargs['rumble'])] + controller.register_scraper(scraper = RumbleScraper()) + controller.scrape_channels(channels = channels, archive_media = True) + + etl_controller.register_transformer(RumbleTransformer()) + etl_controller.transform_all_untransformed() + etl_controller.transform_all_untransformed_info() + + sessionfactory = sessionmaker() + sessionfactory.configure(bind=engine) + session = sessionfactory() + + posts = session.query(Post).all() + media = session.query(Media).all() + + assert len(posts) == 7 + # assert len(media) == 0 + + assert '#whitegold #icedoutcuban' in posts[0].content + # assert json.loads(media[0].exif)['Composite:ImageSize'] == "826 728" \ No newline at end of file diff --git a/tests/transformer/telegram_telethon.py b/tests/transformer/telegram_telethon.py new file mode 100644 index 0000000..a5389b6 --- /dev/null +++ b/tests/transformer/telegram_telethon.py @@ -0,0 +1,34 @@ +from sqlalchemy.orm import sessionmaker, with_polymorphic +import json + +import pytest + +from cisticola.base import Channel +from cisticola.scraper import TelegramTelethonScraper +from cisticola.transformer import TelegramTelethonTransformer +from cisticola.base import Post, Media + +@pytest.mark.media +def test_scrape_etl_telegram_telethon(engine, controller, etl_controller, channel_kwargs): + controller.reset_db() + + channels = [Channel(**channel_kwargs['telegram'])] + controller.register_scraper(scraper = TelegramTelethonScraper()) + controller.scrape_channels(channels = channels, archive_media = True) + + etl_controller.register_transformer(TelegramTelethonTransformer()) + etl_controller.transform_all_untransformed() + etl_controller.transform_all_untransformed_info() + + sessionfactory = sessionmaker() + sessionfactory.configure(bind=engine) + session = sessionfactory() + + posts = session.query(Post).all() + media = session.query(Media).all() + + assert len(posts) == 19 + assert len(media) == 13 + + assert posts[16].content == "Taking pre-orders now" + assert json.loads(media[0].exif)['Composite:ImageSize'] == "1028 1280" \ No newline at end of file diff --git a/tests/transformer/twitter.py b/tests/transformer/twitter.py index 3c50d1c..8799aad 100644 --- a/tests/transformer/twitter.py +++ b/tests/transformer/twitter.py @@ -1,4 +1,4 @@ -from sqlalchemy.orm import sessionmaker, with_polymorphic +from sqlalchemy.orm import sessionmaker import json import pytest @@ -18,6 +18,7 @@ def test_scrape_etl_twitter(engine, controller, etl_controller, channel_kwargs): etl_controller.register_transformer(TwitterTransformer()) etl_controller.transform_all_untransformed() + etl_controller.transform_all_untransformed_info() sessionfactory = sessionmaker() sessionfactory.configure(bind=engine) @@ -26,8 +27,8 @@ def test_scrape_etl_twitter(engine, controller, etl_controller, channel_kwargs): posts = session.query(Post).all() media = session.query(Media).all() - assert len(posts) == 10 - assert len(media) == 7 + assert len(posts) == 12 + assert len(media) == 4 - assert posts[-1].content == "BARN" - assert json.loads(media[-1].exif)['Composite:ImageSize'] == "826 728" \ No newline at end of file + assert posts[2].content == "BARN" + assert json.loads(media[0].exif)['Composite:ImageSize'] == "826 728" \ No newline at end of file