Merge branch 'more-channel-info-transformers' of https://github.com/bellingcat/cisticola into main

This commit is contained in:
Logan Williams
2022-06-10 08:07:02 +00:00
17 changed files with 456 additions and 195 deletions

View File

@@ -9,6 +9,7 @@ loguru = "*"
gogettr = "*"
requests = "*"
bs4 = "*"
lxml = "*"
dateparser = "*"
boto3 = "*"
ffmpeg-python = "*"

196
Pipfile.lock generated
View File

@@ -1,7 +1,7 @@
{
"_meta": {
"hash": {
"sha256": "3f0312747083221d57ac2c4ce96786a6ead34aa3a3a3519fed4ea4382f672633"
"sha256": "13cc50755a59b2cd8bf93049a9a695aa27d35b973b0bdc154af5d21ce48fd57f"
},
"pipfile-spec": 6,
"requires": {
@@ -21,7 +21,6 @@
"sha256:58d5c3d29f5a36ffeb94f02f0d786cd53014cf9b3b3951d42e0080d8a9498d30",
"sha256:ad9aa55b65ef2808eb405f46cf74df7fcb7044d5cbc26487f96eb2ef2e436693"
],
"markers": "python_version >= '3.6'",
"version": "==4.11.1"
},
"blis": {
@@ -50,19 +49,18 @@
},
"boto3": {
"hashes": [
"sha256:927b5e8e2decad746e6c32bb81f15c2ea9ab4398286134d21f6742493eb893f6",
"sha256:e3c10adc7be890b147568a4162d9cafb876f11f87460c4a0dc90742d6d4ebe7c"
"sha256:1c13d555172cf88eb645af2429e4a7f42be85e365d6ffc110c952a556d3f8808",
"sha256:4af6a8bc5110b5f9d2fbd00a3c110e4c4cc36fae78d05afa354831f5789e363b"
],
"index": "pypi",
"version": "==1.24.2"
"version": "==1.24.6"
},
"botocore": {
"hashes": [
"sha256:131f71fe16ef84f9e0e72c54d2e230a6d8e79dd3947f507259a129649649a35d",
"sha256:b7cdd4f4a6395a084a381a7d2a25b177e6de5f8a4dfa3c645ec957ba3c83e200"
"sha256:97c909a6ec5ad421573c18ae67fc6ea4232502cd30cffaf03bfcb584d9df652d",
"sha256:eeebe304161db6828413dc358ea80ece52f4ddbc8ecde4dd58978d5861a09293"
],
"markers": "python_version >= '3.7'",
"version": "==1.27.2"
"version": "==1.27.6"
},
"brotli": {
"hashes": [
@@ -144,7 +142,6 @@
"sha256:6a94c6402995a99c3970cc7e4884bb60b4a8639938157eeed436098bf9831757",
"sha256:f9f17d2aec496a9aa6b76f53e3b614c965223c061982d434d160f930c698a9db"
],
"markers": "python_version ~= '3.7'",
"version": "==5.2.0"
},
"catalogue": {
@@ -152,7 +149,6 @@
"sha256:535d33ae79ebd21ca298551d85da186ae8b8e1df36b0fb0246da774163ec2d6b",
"sha256:cab4feda641fe05da1e6a1a9d123b0869d5ca324dcd93d4a5c384408ab62e7fb"
],
"markers": "python_version >= '3.6'",
"version": "==2.0.7"
},
"certifi": {
@@ -160,7 +156,6 @@
"sha256:9c5705e395cd70084351dd8ad5c41e65655e08ce46f2ec9cf6c2c08390f71eb7",
"sha256:f1d53542ee8cbedbe2118b5686372fb33c297fcd6379b050cca0ef13a597382a"
],
"markers": "python_version >= '3.6'",
"version": "==2022.5.18.1"
},
"charset-normalizer": {
@@ -168,7 +163,6 @@
"sha256:2857e29ff0d34db842cd7ca3230549d1a697f96ee6d3fb071cfa6c7393832597",
"sha256:6881edbebdb17b39b4eaaa821b438bf6eddffb4468cf344f09f89def34a8b1df"
],
"markers": "python_version >= '3'",
"version": "==2.0.12"
},
"click": {
@@ -176,7 +170,6 @@
"sha256:6a7a62563bbfabfda3a38f3023a1db4a35978c0abd76f6c9605ecd6554d6d9b1",
"sha256:8458d7b1287c5fb128c90e23381cf99dcde74beaf6c7ff6384ce84d6fe090adb"
],
"markers": "python_version >= '3.6'",
"version": "==8.0.4"
},
"cryptg": {
@@ -261,24 +254,14 @@
"index": "pypi",
"version": "==0.2.0"
},
"filelock": {
"hashes": [
"sha256:37def7b658813cda163b56fc564cdc75e86d338246458c4c28ae84cabefa2404",
"sha256:3a0fd85166ad9dbab54c9aec96737b744106dc5f15c0b09a6744a445299fcf04"
],
"markers": "python_version >= '3.7'",
"version": "==3.7.1"
},
"future": {
"hashes": [
"sha256:b1bead90b70cf6ec3f0710ae53a525360fa360d306a86583adc6bf83a4db537d"
],
"markers": "python_version >= '2.6' and python_version not in '3.0, 3.1, 3.2'",
"version": "==0.18.2"
},
"gabber": {
"git": "https://github.com/stanfordio/gabber.git",
"ref": "a032db8047fa6b762b2fc127b08ee37d6ad9e110"
"git": "https://github.com/stanfordio/gabber.git"
},
"gogettr": {
"hashes": [
@@ -290,19 +273,17 @@
},
"google-auth": {
"hashes": [
"sha256:1ba4938e032b73deb51e59c4656a00e0939cf0b1112575099f136babb4563312",
"sha256:349ac49b18b01019453cc99c11c92ed772739778c92f184002b7ab3a5b7ac77d"
"sha256:8a954960f852d5f19e6af14dd8e75c20159609e85d8db37e4013cc8c3824a7e1",
"sha256:df549a1433108801b11bdcc0e312eaf0d5f0500db42f0523e4d65c78722e8475"
],
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4, 3.5'",
"version": "==2.6.6"
"version": "==2.7.0"
},
"google-auth-oauthlib": {
"hashes": [
"sha256:24f67735513c4c7134dbde2f1dee5a1deb6acc8dfcb577d7bff30d213a28e7b0",
"sha256:30596b824fc6808fdaca2f048e4998cc40fb4b3599eaea66d28dc7085b36c5b8"
"sha256:6d6161d0ec0a62e2abf2207c6071c117ec5897b300823c4bb2d963ee86e20e4f",
"sha256:d5e98a71203330699f92a26bc08847a92e8c3b1b8d82a021f1af34164db143ae"
],
"markers": "python_version >= '3.6'",
"version": "==0.5.1"
"version": "==0.5.2"
},
"greenlet": {
"hashes": [
@@ -377,22 +358,20 @@
"sha256:84d9dd047ffa80596e0f246e2eab0b391788b0503584e8945f2368256d2735ff",
"sha256:9d643ff0a55b762d5cdb124b8eaa99c66322e2157b69160bc32796e824360e6d"
],
"markers": "python_version >= '3'",
"version": "==3.3"
},
"instaloader": {
"hashes": [
"sha256:7fa6147810eedcc1dedcdec8cfa1f220c9379ab8faeab6a336a7c181d944e2e4"
"sha256:77d4a140aafd1a9f48765db1f5ede9b74136eda67f428bfc392d7440b26ae74c"
],
"index": "pypi",
"version": "==4.9"
"version": "==4.9.1"
},
"jinja2": {
"hashes": [
"sha256:31351a702a408a9e7595a8fc6150fc3f43bb6bf7e319770cbc0db9df9437e852",
"sha256:6088930bfe239f0e6710546ab9c19c9ef35e29792895fed6e6e31a023a182a61"
],
"markers": "python_version >= '3.7'",
"version": "==3.1.2"
},
"jmespath": {
@@ -400,7 +379,6 @@
"sha256:a490e280edd1f57d6de88636992d05b71e97d69a26a19f058ecf7d304474bf5e",
"sha256:e8dcd576ed616f14ec02eed0005c85973b5890083313860136657e24784e4c04"
],
"markers": "python_version >= '3.7'",
"version": "==1.0.0"
},
"langcodes": {
@@ -408,7 +386,6 @@
"sha256:4d89fc9acb6e9c8fdef70bcdf376113a3db09b67285d9e1d534de6d8818e7e69",
"sha256:794d07d5a28781231ac335a1561b8442f8648ca07cd518310aeb45d6f0807ef6"
],
"markers": "python_version >= '3.6'",
"version": "==3.3.0"
},
"langdetect": {
@@ -421,11 +398,11 @@
},
"loguru": {
"hashes": [
"sha256:b28e72ac7a98be3d28ad28570299a393dfcd32e5e3f6a353dec94675767b6319",
"sha256:f8087ac396b5ee5f67c963b495d615ebbceac2796379599820e324419d53667c"
"sha256:066bd06758d0a513e9836fd9c6b5a75bfb3fd36841f4b996bc60b547a309d41c",
"sha256:4e2414d534a2ab57573365b3e6d0234dfb1d84b68b7f3b948e6fb743860a77c3"
],
"index": "pypi",
"version": "==0.5.3"
"version": "==0.6.0"
},
"lxml": {
"hashes": [
@@ -493,7 +470,7 @@
"sha256:f6d23a01921b741774f35e924d418a43cf03eca1444f3fdfd7978d35a5aaab8b",
"sha256:fcdf70191f0d1761d190a436db06a46f05af60e1410e1507935f0332280c9268"
],
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
"index": "pypi",
"version": "==4.9.0"
},
"markupsafe": {
@@ -539,7 +516,6 @@
"sha256:f121a1420d4e173a5d96e47e9a0c0dcff965afdf1626d28de1460815f7c4ee7a",
"sha256:fc7b548b17d238737688817ab67deebb30e8073c95749d55538ed473130ec0c7"
],
"markers": "python_version >= '3.7'",
"version": "==2.1.1"
},
"murmurhash": {
@@ -571,7 +547,6 @@
"sha256:6397602efb3c2d7baebd2166ed85731ae1c1d475abca22090b7141ff5034b3e1",
"sha256:9c9f243fcec7f410f138cb12c21c84c64fde4195481a30c9bfb05b5f003adfed"
],
"markers": "python_version >= '3.5' and python_version < '4'",
"version": "==1.45.1"
},
"numpy": {
@@ -599,7 +574,6 @@
"sha256:f5a1c7c45ff29db501f9e38a360aedd833e355c14c75155ba2bd46ee3799e30a",
"sha256:fde47931544086a648b12ee7c9ccf30edd6c6db776005fb07e4a019a04980042"
],
"markers": "python_version >= '3.8'",
"version": "==1.23.0rc2"
},
"oauthlib": {
@@ -607,50 +581,29 @@
"sha256:23a8208d75b902797ea29fd31fa80a15ed9dc2c6c16fe73f5d346f83f6fa27a2",
"sha256:6db33440354787f9b7f3a6dbd4febf5d0f93758354060e802f6c06cb493022fe"
],
"markers": "python_version >= '3.6'",
"version": "==3.2.0"
},
"ocrd-pyexiftool": {
"hashes": [
"sha256:13d7aeabd765256e7640e4198cc742538a9c458b34aca6644b356c6e908c922a",
"sha256:457a432d167174e93f63a487879ea767b0ff54aef539e40586fffe5fb9050461",
"sha256:9c77e753769857657069de76d2c4b592efbd99db3974a76df561fd0ca75cec0e"
],
"index": "pypi",
"version": "==0.2.0"
},
"packaging": {
"hashes": [
"sha256:dd47c42927d89ab911e606518907cc2d3a1f38bbd026385970643f9c5b8ecfeb",
"sha256:ef103e05f519cdc783ae24ea4e2e0f508a9c99b2d4969652eed6a2e1ea5bd522"
],
"markers": "python_version >= '3.6'",
"version": "==21.3"
},
"pandas": {
"hashes": [
"sha256:0010771bd9223f7afe5f051eb47c4a49534345dfa144f2f5470b27189a4dd3b5",
"sha256:061609334a8182ab500a90fe66d46f6f387de62d3a9cb9aa7e62e3146c712167",
"sha256:09d8be7dd9e1c4c98224c4dfe8abd60d145d934e9fc1f5f411266308ae683e6a",
"sha256:295872bf1a09758aba199992c3ecde455f01caf32266d50abc1a073e828a7b9d",
"sha256:3228198333dd13c90b6434ddf61aa6d57deaca98cf7b654f4ad68a2db84f8cfe",
"sha256:385c52e85aaa8ea6a4c600a9b2821181a51f8be0aee3af6f2dcb41dafc4fc1d0",
"sha256:51649ef604a945f781105a6d2ecf88db7da0f4868ac5d45c51cb66081c4d9c73",
"sha256:5586cc95692564b441f4747c47c8a9746792e87b40a4680a2feb7794defb1ce3",
"sha256:5a206afa84ed20e07603f50d22b5f0db3fb556486d8c2462d8bc364831a4b417",
"sha256:5b79af3a69e5175c6fa7b4e046b21a646c8b74e92c6581a9d825687d92071b51",
"sha256:5c54ea4ef3823108cd4ec7fb27ccba4c3a775e0f83e39c5e17f5094cb17748bc",
"sha256:8c5bf555b6b0075294b73965adaafb39cf71c312e38c5935c93d78f41c19828a",
"sha256:92bc1fc585f1463ca827b45535957815b7deb218c549b7c18402c322c7549a12",
"sha256:95c1e422ced0199cf4a34385ff124b69412c4bc912011ce895582bee620dfcaa",
"sha256:b8134651258bce418cb79c71adeff0a44090c98d955f6953168ba16cc285d9f7",
"sha256:be67c782c4f1b1f24c2f16a157e12c2693fd510f8df18e3287c77f33d124ed07",
"sha256:c072c7f06b9242c855ed8021ff970c0e8f8b10b35e2640c657d2a541c5950f59",
"sha256:d0d4f13e4be7ce89d7057a786023c461dd9370040bdb5efa0a7fe76b556867a0",
"sha256:df82739e00bb6daf4bba4479a40f38c718b598a84654cbd8bb498fd6b0aa8c16",
"sha256:f549097993744ff8c41b5e8f2f0d3cbfaabe89b4ae32c8c08ead6cc535b80139",
"sha256:ff08a14ef21d94cdf18eef7c569d66f2e24e0bc89350bcd7d243dd804e3b5eb2"
],
"markers": "python_version >= '3.8'",
"version": "==1.4.2"
},
"pathy": {
"hashes": [
"sha256:25fd04cec6393661113086730ce69c789d121bea83ab1aa18452e8fd42faf29a",
"sha256:838624441f799a06b446a657e4ecc9ebc3fdd05234397e044a7c87e8f6e76b1c"
],
"markers": "python_version >= '3.6'",
"version": "==0.6.1"
},
"pillow": {
@@ -694,12 +647,10 @@
"sha256:f3f6a6034140e9e17e9abc175fc7a266a6e63652028e157750bd98e804a8ed9a",
"sha256:ffde4c6fabb52891d81606411cbfaf77756e3b561b566efd270b3ed3791fde4e"
],
"markers": "python_version >= '3.7'",
"version": "==9.1.1"
},
"polyphemus": {
"git": "https://github.com/bellingcat/polyphemus",
"ref": "b18e5591fa4f903e5506742c2e3f17d45bb88755"
"git": "https://github.com/bellingcat/polyphemus"
},
"preshed": {
"hashes": [
@@ -814,7 +765,6 @@
"sha256:f75009715dcf4a3d680c2338ab19dac5498f8121173a929872950f4fb3a48fbf",
"sha256:f8524b8bc89470cec7ac51734907818d3620fb1637f8f8b542d650ebec42a126"
],
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
"version": "==3.14.1"
},
"pydantic": {
@@ -842,29 +792,15 @@
"sha256:ea5cb40a3b23b3265f6325727ddfc45141b08ed665458be8c6285e7b85bd73a1",
"sha256:fec866a0b59f372b7e776f2d7308511784dace622e0992a0b59ea3ccee0ae833"
],
"markers": "python_full_version >= '3.6.1'",
"version": "==1.8.2"
},
"pyexiftool": {
"git": "https://github.com/smarnach/pyexiftool.git",
"ref": "3db3764895e687d75b42d3ae4e554ca8664a7f6f"
},
"pyparsing": {
"hashes": [
"sha256:2b020ecf7d21b687f219b71ecad3631f644a47f01403fa1d1036b0c6416d70fb",
"sha256:5026bae9a10eeaefb61dab2f09052b9f4307d44aee4eda64b309723d8d206bbc"
],
"markers": "python_full_version >= '3.6.8'",
"version": "==3.0.9"
},
"pysocks": {
"hashes": [
"sha256:08e69f092cc6dbe92a0fdd16eeb9b9ffbc13cadfe5ca4c7bd92ffb078b293299",
"sha256:2725bd0a9925919b9b51739eea5f9e2bae91e83288108a9ad338b2e3a4435ee5",
"sha256:3f8804571ebe159c380ac6de37643bb4685970655d3bba243530d6558b799aa0"
],
"version": "==1.7.1"
},
"pytesseract": {
"hashes": [
"sha256:7e2bafc7f48d1bb71443ce4633a56f5e21925a98f220a36c336297edcd1956d0",
@@ -878,7 +814,6 @@
"sha256:0123cacc1627ae19ddf3c27a5de5bd67ee4586fbdd6440d9748f8abb483d3e86",
"sha256:961d03dc3453ebbc59dbdea9e4e11c5651520a876d0f4db161e8674aae935da9"
],
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2'",
"version": "==2.8.2"
},
"pytz": {
@@ -894,7 +829,6 @@
"sha256:8314c9692a636c8eb3bda879b9f119e350e93223ae83e70e80c31675a0fdc1a6",
"sha256:af097bae1b616dde5c5744441e2ddc69e74dfdcb0c263129610d85b87445a59d"
],
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4, 3.5'",
"version": "==0.1.0.post0"
},
"ratelimit": {
@@ -981,23 +915,21 @@
"sha256:fbc88d3ba402b5d041d204ec2449c4078898f89c4a6e6f0ed1c1a510ef1e221d",
"sha256:fbd3fe37353c62fd0eb19fb76f78aa693716262bcd5f9c14bb9e5aca4b3f0dc4"
],
"markers": "python_version >= '3.6'",
"version": "==2022.3.2"
},
"requests": {
"hashes": [
"sha256:68d7c56fd5a8999887728ef304a6d12edc7be74f1cfa47714fc8b414525c9a61",
"sha256:f22fa1e554c9ddfd16e6e41ac79759e17be9e492b3587efa038054674760e72d"
"sha256:bc7861137fbce630f17b03d3ad02ad0bf978c844f3536d0edda6499dafce2b6f",
"sha256:d568723a7ebd25875d8d1eaf5dfa068cd2fc8194b2e483d7b1f7c81918dbec6b"
],
"index": "pypi",
"version": "==2.27.1"
"version": "==2.28.0"
},
"requests-oauthlib": {
"hashes": [
"sha256:2577c501a2fb8d05a304c09d090d6e47c306fef15809d102b327cf8364bddab5",
"sha256:75beac4a47881eeb94d5ea5d6ad31ef88856affe2332b9aafb52c6452ccf0d7a"
],
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
"version": "==1.3.1"
},
"rsa": {
@@ -1005,7 +937,6 @@
"sha256:5c6bd9dc7a543b7fe4304a631f8a8a3b674e2bbfc49c2ae96200cdbe55df6b17",
"sha256:95c5d300c4e879ee69708c428ba566c59478fd653cc3a22243eeb8ed846950bb"
],
"markers": "python_version >= '3.6' and python_version < '4'",
"version": "==4.8"
},
"s3transfer": {
@@ -1013,23 +944,13 @@
"sha256:06176b74f3a15f61f1b4f25a1fc29a4429040b7647133a463da8fa5bd28d5ecd",
"sha256:2ed07d3866f523cc561bf4a00fc5535827981b117dd7876f036b0c1aca42c947"
],
"markers": "python_version >= '3.7'",
"version": "==0.6.0"
},
"setuptools": {
"hashes": [
"sha256:68e45d17c9281ba25dc0104eadd2647172b3472d9e01f911efa57965e8d51a36",
"sha256:a43bdedf853c670e5fed28e5623403bad2f73cf02f9a2774e91def6bda8265a7"
],
"markers": "python_version >= '3.7'",
"version": "==62.3.2"
},
"six": {
"hashes": [
"sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926",
"sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254"
],
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2'",
"version": "==1.16.0"
},
"smart-open": {
@@ -1037,19 +958,16 @@
"sha256:71d14489da58b60ce12fc3ecb823facc59a8b23cd1b58edb97175640350d3a62",
"sha256:75abf758717a92a8f53aa96953f0c245c8cedf8e1e4184903db3659b419d4c17"
],
"markers": "python_version >= '3.6' and python_version < '4'",
"version": "==5.2.1"
},
"snscrape": {
"git": "https://github.com/bellingcat/snscrape",
"ref": "0822a9c3548c4d0736a98f617d823b8475d24fda"
"git": "https://github.com/bellingcat/snscrape"
},
"soupsieve": {
"hashes": [
"sha256:3b2503d3c7084a42b1ebd08116e5f81aadfaea95863628c80a3b774a11b7c759",
"sha256:fc53893b3da2c33de295667a0e19f078c14bf86544af307354de5fcf12a3f30d"
],
"markers": "python_version >= '3.6'",
"version": "==2.3.2.post1"
},
"spacy": {
@@ -1079,7 +997,6 @@
"sha256:4f7dcbc4e6c8e8cb4eadbb009f9c0a1a2a67442e0032c8d6776c9470c3759903",
"sha256:dfd58b0cc65b3596cb06f7b95e7bf4fff34668297c59eb179eb050db07b199df"
],
"markers": "python_version >= '3.6'",
"version": "==3.0.9"
},
"spacy-loggers": {
@@ -1087,7 +1004,6 @@
"sha256:d48c9313a577ad1818da961cf6db71a73fd1e556ae47e6e68d7e28b541d11e18",
"sha256:e75d44f4cf99e6763d7132ca7c8c420e0a92790222a08bc8eb9e24ea2c13536e"
],
"markers": "python_version >= '3.6'",
"version": "==1.0.2"
},
"sqlalchemy": {
@@ -1101,6 +1017,7 @@
"sha256:3197441772dc3b1c6419f13304402f2418a18d7fe78000aa5a026e7100836739",
"sha256:3688f92c62db6c5df268e2264891078f17ecb91e3141b400f2e28d0f75796dea",
"sha256:3862a069a24f354145e01a76c7c720c263d62405fe5bed038c46a7ce900f5dd6",
"sha256:4a17c1a1152ca4c29d992714aa9df3054da3af1598e02134f2e7314a32ef69d8",
"sha256:4c1d9fb3931e27d59166bb5c4dcc911400fee51082cfba66ceb19ac954ade068",
"sha256:4e8706919829d455a9fa687c6bbd1b048e36fec3919a59f2d366247c2bfdbd9c",
"sha256:50c8eaf44c3fed5ba6758d375de25f163e46137c39fda3a72b9ee1d1bb327dfc",
@@ -1153,7 +1070,6 @@
"sha256:f96af9fde9f58d5923091fa723fa0fed58a83781b98e143a5d1fac5e738b9f0d",
"sha256:fb08416fd6ef04c51fdeefd6d28592b64563b2853243c571a9b0d67403b5be7f"
],
"markers": "python_version >= '3.6'",
"version": "==2.4.3"
},
"telethon": {
@@ -1186,7 +1102,6 @@
"sha256:eba973fe229e7fa86b99f2c5e2724f7f19040ac75a8ef7c8b23b434dac1eadea",
"sha256:fd2d49a80a6c95be4eb0f8370a22eef903ecad10b65762d39c9b192abf905f7c"
],
"markers": "python_version >= '3.6'",
"version": "==8.0.17"
},
"tqdm": {
@@ -1202,7 +1117,6 @@
"sha256:5646aef0d936b2c761a10393f0384ee6b5c7fe0bb3e5cd710b17134ca1d99cff",
"sha256:e8467f0ebac0c81366c2168d6ad9f888efdfb6d4e1d3d5b4a004f46fa444b5c3"
],
"markers": "python_version >= '3.6'",
"version": "==0.4.1"
},
"typing-extensions": {
@@ -1210,7 +1124,6 @@
"sha256:6657594ee297170d19f67d55c05852a874e7eb634f4f753dbd667855e07c1708",
"sha256:f1c24655a0da0d1b67f07e17a5e6b2a105894e6824b92096378bb3668ef02376"
],
"markers": "python_version >= '3.7'",
"version": "==4.2.0"
},
"tzdata": {
@@ -1226,7 +1139,6 @@
"sha256:89885494684c929d9191c57aa27502afc87a579be5cdd3225c77c463ea043745",
"sha256:ee5842fa3a795f023514ac2d801c4a81d1743bbe642e3940143326b3a00addd7"
],
"markers": "python_version >= '3.6'",
"version": "==4.2"
},
"urllib3": {
@@ -1234,7 +1146,6 @@
"sha256:44ece4d53fb1706f667c9bd1c648f5469a2ec925fcf3a776667042d645472c14",
"sha256:aabaf16477806a5e1dd19aa41f8c2b7950dd3c746362d7e3223dbe6de6ac448e"
],
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' and python_version < '4'",
"version": "==1.26.9"
},
"wasabi": {
@@ -1295,7 +1206,6 @@
"sha256:fab7c640815812ed5f10fbee7abbf58788d602046b7bb3af9b1ac753a6d5e916",
"sha256:fc06cc8073c8e87072138ba1e431300e2d408f054b27047d047b549455066ff4"
],
"markers": "python_version >= '3.7'",
"version": "==10.3"
},
"yt-dlp": {
@@ -1320,7 +1230,6 @@
"sha256:2d27e3784d7a565d36ab851fe94887c5eccd6a463168875832a1be79c82828b4",
"sha256:626ba8234211db98e869df76230a137c4c40a12d72445c45d5f5b716f076e2fd"
],
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
"version": "==21.4.0"
},
"babel": {
@@ -1328,7 +1237,6 @@
"sha256:3f349e85ad3154559ac4930c3918247d319f21910d5ce4b25d439ed8693b98d2",
"sha256:98aeaca086133efb3e1e2aad0396987490c8425929ddbcfe0550184fdc54cd13"
],
"markers": "python_version >= '3.6'",
"version": "==2.10.1"
},
"black": {
@@ -1365,7 +1273,6 @@
"sha256:9c5705e395cd70084351dd8ad5c41e65655e08ce46f2ec9cf6c2c08390f71eb7",
"sha256:f1d53542ee8cbedbe2118b5686372fb33c297fcd6379b050cca0ef13a597382a"
],
"markers": "python_version >= '3.6'",
"version": "==2022.5.18.1"
},
"charset-normalizer": {
@@ -1373,7 +1280,6 @@
"sha256:2857e29ff0d34db842cd7ca3230549d1a697f96ee6d3fb071cfa6c7393832597",
"sha256:6881edbebdb17b39b4eaaa821b438bf6eddffb4468cf344f09f89def34a8b1df"
],
"markers": "python_version >= '3'",
"version": "==2.0.12"
},
"click": {
@@ -1381,13 +1287,9 @@
"sha256:6a7a62563bbfabfda3a38f3023a1db4a35978c0abd76f6c9605ecd6554d6d9b1",
"sha256:8458d7b1287c5fb128c90e23381cf99dcde74beaf6c7ff6384ce84d6fe090adb"
],
"markers": "python_version >= '3.6'",
"version": "==8.0.4"
},
"coverage": {
"extras": [
"toml"
],
"hashes": [
"sha256:01c5615d13f3dd3aa8543afc069e5319cfa0c7d712f6e04b920431e5c564a749",
"sha256:106c16dfe494de3193ec55cac9640dd039b66e196e4641fa8ac396181578b982",
@@ -1431,7 +1333,6 @@
"sha256:fdb6f7bd51c2d1714cea40718f6149ad9be6a2ee7d93b19e9f00934c0f2a74d9",
"sha256:ffa9297c3a453fba4717d06df579af42ab9a28022444cae7fa605af4df612d54"
],
"markers": "python_version >= '3.7'",
"version": "==6.4.1"
},
"docutils": {
@@ -1439,7 +1340,6 @@
"sha256:686577d2e4c32380bb50cbb22f575ed742d58168cee37e99117a854bcd88f125",
"sha256:cf316c8370a737a022b72b56874f6602acf974a37a9fba42ec2876387549fc61"
],
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
"version": "==0.17.1"
},
"idna": {
@@ -1447,7 +1347,6 @@
"sha256:84d9dd047ffa80596e0f246e2eab0b391788b0503584e8945f2368256d2735ff",
"sha256:9d643ff0a55b762d5cdb124b8eaa99c66322e2157b69160bc32796e824360e6d"
],
"markers": "python_version >= '3'",
"version": "==3.3"
},
"imagesize": {
@@ -1455,7 +1354,6 @@
"sha256:1db2f82529e53c3e929e8926a1fa9235aa82d0bd0c580359c67ec31b2fddaa8c",
"sha256:cd1750d452385ca327479d45b64d9c7729ecf0b3969a58148298c77092261f9d"
],
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
"version": "==1.3.0"
},
"importlib-metadata": {
@@ -1478,7 +1376,6 @@
"sha256:31351a702a408a9e7595a8fc6150fc3f43bb6bf7e319770cbc0db9df9437e852",
"sha256:6088930bfe239f0e6710546ab9c19c9ef35e29792895fed6e6e31a023a182a61"
],
"markers": "python_version >= '3.7'",
"version": "==3.1.2"
},
"markupsafe": {
@@ -1524,7 +1421,6 @@
"sha256:f121a1420d4e173a5d96e47e9a0c0dcff965afdf1626d28de1460815f7c4ee7a",
"sha256:fc7b548b17d238737688817ab67deebb30e8073c95749d55538ed473130ec0c7"
],
"markers": "python_version >= '3.7'",
"version": "==2.1.1"
},
"mypy-extensions": {
@@ -1539,7 +1435,6 @@
"sha256:dd47c42927d89ab911e606518907cc2d3a1f38bbd026385970643f9c5b8ecfeb",
"sha256:ef103e05f519cdc783ae24ea4e2e0f508a9c99b2d4969652eed6a2e1ea5bd522"
],
"markers": "python_version >= '3.6'",
"version": "==21.3"
},
"pathspec": {
@@ -1554,7 +1449,6 @@
"sha256:027d8e83a2d7de06bbac4e5ef7e023c02b863d7ea5d079477e722bb41ab25788",
"sha256:58c8abb07dcb441e6ee4b11d8df0ac856038f944ab98b7be6b27b2a3c7feef19"
],
"markers": "python_version >= '3.7'",
"version": "==2.5.2"
},
"pluggy": {
@@ -1562,7 +1456,6 @@
"sha256:4224373bacce55f955a878bf9cfa763c1e360858e330072059e10bad68531159",
"sha256:74134bbf457f031a36d68416e1509f34bd5ccc019f0bcc952c7b909d06b37bd3"
],
"markers": "python_version >= '3.6'",
"version": "==1.0.0"
},
"py": {
@@ -1570,7 +1463,6 @@
"sha256:51c75c4126074b472f746a24399ad32f6053d1b34b68d2fa41e558e6f4a98719",
"sha256:607c53218732647dff4acdfcd50cb62615cedf612e72d1724fb1a0cc6405b378"
],
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
"version": "==1.11.0"
},
"pygments": {
@@ -1578,7 +1470,6 @@
"sha256:5eb116118f9612ff1ee89ac96437bb6b49e8f04d8a13b514ba26f620208e26eb",
"sha256:dc9c10fb40944260f6ed4c688ece0cd2048414940f1cea51b8b226318411c519"
],
"markers": "python_version >= '3.6'",
"version": "==2.12.0"
},
"pyparsing": {
@@ -1586,7 +1477,6 @@
"sha256:2b020ecf7d21b687f219b71ecad3631f644a47f01403fa1d1036b0c6416d70fb",
"sha256:5026bae9a10eeaefb61dab2f09052b9f4307d44aee4eda64b309723d8d206bbc"
],
"markers": "python_full_version >= '3.6.8'",
"version": "==3.0.9"
},
"pytest": {
@@ -1631,11 +1521,11 @@
},
"requests": {
"hashes": [
"sha256:68d7c56fd5a8999887728ef304a6d12edc7be74f1cfa47714fc8b414525c9a61",
"sha256:f22fa1e554c9ddfd16e6e41ac79759e17be9e492b3587efa038054674760e72d"
"sha256:bc7861137fbce630f17b03d3ad02ad0bf978c844f3536d0edda6499dafce2b6f",
"sha256:d568723a7ebd25875d8d1eaf5dfa068cd2fc8194b2e483d7b1f7c81918dbec6b"
],
"index": "pypi",
"version": "==2.27.1"
"version": "==2.28.0"
},
"snowballstemmer": {
"hashes": [
@@ -1665,7 +1555,6 @@
"sha256:806111e5e962be97c29ec4c1e7fe277bfd19e9652fb1a4392105b43e01af885a",
"sha256:a072735ec80e7675e3f432fcae8610ecf509c5f1869d17e2eecff44389cdbc58"
],
"markers": "python_version >= '3.5'",
"version": "==1.0.2"
},
"sphinxcontrib-devhelp": {
@@ -1673,7 +1562,6 @@
"sha256:8165223f9a335cc1af7ffe1ed31d2871f325254c0423bc0c4c7cd1c1e4734a2e",
"sha256:ff7f1afa7b9642e7060379360a67e9c41e8f3121f2ce9164266f61b9f4b338e4"
],
"markers": "python_version >= '3.5'",
"version": "==1.0.2"
},
"sphinxcontrib-htmlhelp": {
@@ -1681,7 +1569,6 @@
"sha256:d412243dfb797ae3ec2b59eca0e52dac12e75a241bf0e4eb861e450d06c6ed07",
"sha256:f5f8bb2d0d629f398bf47d0d69c07bc13b65f75a81ad9e2f71a63d4b7a2f6db2"
],
"markers": "python_version >= '3.6'",
"version": "==2.0.0"
},
"sphinxcontrib-jsmath": {
@@ -1689,7 +1576,6 @@
"sha256:2ec2eaebfb78f3f2078e73666b1415417a116cc848b72e5172e596c871103178",
"sha256:a9925e4a4587247ed2191a22df5f6970656cb8ca2bd6284309578f2153e0c4b8"
],
"markers": "python_version >= '3.5'",
"version": "==1.0.1"
},
"sphinxcontrib-qthelp": {
@@ -1697,7 +1583,6 @@
"sha256:4c33767ee058b70dba89a6fc5c1892c0d57a54be67ddd3e7875a18d14cba5a72",
"sha256:bd9fc24bcb748a8d51fd4ecaade681350aa63009a347a8c14e637895444dfab6"
],
"markers": "python_version >= '3.5'",
"version": "==1.0.3"
},
"sphinxcontrib-serializinghtml": {
@@ -1705,7 +1590,6 @@
"sha256:352a9a00ae864471d3a7ead8d7d79f5fc0b57e8b3f95e9867eb9eb28999b92fd",
"sha256:aa5f6de5dfdf809ef505c4895e51ef5c9eac17d0f287933eb49ec495280b6952"
],
"markers": "python_version >= '3.5'",
"version": "==1.1.5"
},
"tomli": {
@@ -1713,7 +1597,6 @@
"sha256:939de3e7a6161af0c887ef91b7d41a53e7c5a1ca976325f429cb46ea9bc30ecc",
"sha256:de526c12914f0c550d15924c62d72abc48d6fe7364aa87328337a31007fe8a4f"
],
"markers": "python_version >= '3.7'",
"version": "==2.0.1"
},
"typing-extensions": {
@@ -1721,7 +1604,6 @@
"sha256:6657594ee297170d19f67d55c05852a874e7eb634f4f753dbd667855e07c1708",
"sha256:f1c24655a0da0d1b67f07e17a5e6b2a105894e6824b92096378bb3668ef02376"
],
"markers": "python_version >= '3.7'",
"version": "==4.2.0"
},
"urllib3": {
@@ -1729,7 +1611,6 @@
"sha256:44ece4d53fb1706f667c9bd1c648f5469a2ec925fcf3a776667042d645472c14",
"sha256:aabaf16477806a5e1dd19aa41f8c2b7950dd3c746362d7e3223dbe6de6ac448e"
],
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' and python_version < '4'",
"version": "==1.26.9"
},
"zipp": {
@@ -1737,7 +1618,6 @@
"sha256:56bf8aadb83c24db6c4b577e13de374ccfb67da2078beba1d037c17980bf43ad",
"sha256:c4f6e5bbf48e74f7a38e7cc5b0480ff42b0ae5178957d564d18932525d5cf099"
],
"markers": "python_version >= '3.7'",
"version": "==3.8.0"
}
}

View File

@@ -237,12 +237,12 @@ class Post:
# replace is here in order to prevent catastrophic backtracking
urls = re.findall(URL_REGEX, self.content.replace("::::::::", ""))
self.outlinks = urls
self.outlinks += urls
HASHTAG_REGEX = r"(?:^|\s)[#]{1}(\w+)"
hashtags = re.findall(HASHTAG_REGEX, self.content)
self.hashtags = hashtags
self.hashtags += hashtags
# regex patterns for finding crypto addresses
BTC_REGEX = r'\b(bc(0([ac-hj-np-z02-9]{39}|[ac-hj-np-z02-9]{59})|1[ac-hj-np-z02-9]{8,87})|[13][a-km-zA-HJ-NP-Z1-9]{25,35})\b'

View File

@@ -421,6 +421,9 @@ class ScraperController:
session.commit()
added += 1
profile = scraper.get_profile(channel)
session.add(profile)
session.commit()
logger.info(
f"{scraper} found {added} new posts from {channel}")

View File

@@ -5,6 +5,7 @@ from html.parser import HTMLParser
import dateparser
import json
from typing import Generator
from dateutil.relativedelta import relativedelta
import requests
from bs4 import BeautifulSoup
@@ -70,7 +71,7 @@ class BitchuteScraper(Scraper):
if channel.platform == "Bitchute" and self.get_username_from_url(channel.url) is not None:
return True
@logger.catch
@logger.catch(reraise = True)
def get_profile(self, channel: Channel) -> RawChannelInfo:
base_url = channel.url
@@ -104,7 +105,7 @@ class BitchuteScraper(Scraper):
profile = {
'description' : description_soup.text.strip(),
'description_links' : [a['href'] for a in description_soup.find_all('a', href = True)],
'created': re.sub(r'\s', ' ', info_list[0].text.split('Created')[1].strip('. ')),
'created': parse_created(re.sub(r'\s', ' ', info_list[0].text.split('Created')[1].strip('. '))),
'videos' : int(info_list[1].text.split('videos')[0].strip()),
'owner_url' : soup.find('p', {'class' : 'owner'}).find('a', href = True)['href'],
'owner_name' : owner_name,
@@ -116,7 +117,7 @@ class BitchuteScraper(Scraper):
return RawChannelInfo(scraper=self.__version__,
platform=channel.platform,
channel=channel.id,
raw_data=json.dumps(profile),
raw_data=json.dumps(profile, default = str),
date_archived=datetime.now(timezone.utc))
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
@@ -484,4 +485,14 @@ def decode_cfemail(cfemail):
return email
#---------------------------------------------------------------------------#
#---------------------------------------------------------------------------#
def parse_created(created):
    """Turn a relative "created ... ago" string into an approximate datetime.

    The text is cut at the first occurrence of ``ago``; what precedes it is
    read as a comma-separated list of ``<number> <unit>`` pairs (for example
    ``"1 year, 2 months ago"``). The combined offset is subtracted from the
    current time.

    Parameters
    ----------
    created : str
        Relative age description.

    Returns
    -------
    datetime
        ``datetime.now()`` minus the parsed offset.

    Raises
    ------
    ValueError
        If a fragment is not exactly ``<number> <unit>`` or the number is not
        an integer.
    TypeError
        If a unit is not a keyword accepted by ``relativedelta``.
    """
    elapsed = {}
    for fragment in created.split('ago')[0].split(','):
        # split() without an argument tolerates repeated or odd whitespace,
        # which split(' ') would turn into extra empty tokens.
        tokens = fragment.split()
        if not tokens:
            # Ignore empty fragments such as those left by a trailing comma.
            continue
        number, unit = tokens
        # relativedelta treats singular keywords (hour=3) as absolute field
        # replacements and plural ones (hours=3) as relative offsets, so every
        # unit must be pluralised, not just year/month/week/day.
        if not unit.endswith('s'):
            unit += 's'
        elapsed[unit] = int(number)

    # NOTE(review): this yields a naive local-time datetime, while other
    # timestamps in this file use datetime.now(timezone.utc) - confirm which
    # is intended before changing it.
    return datetime.now() - relativedelta(**elapsed)

View File

@@ -14,7 +14,7 @@ BASE_URL = 'https://rumble.com'
class RumbleScraper(Scraper):
"""An implementation of a Scraper for Rumble, using custom functions"""
__version__ = "RumbleScraper 0.0.1"
__version__ = "RumbleScraper 0.0.2"
cookiestring = os.environ["YOUTUBE_COOKIESTRING"].replace(r'\n', '\n').replace(r'\t', '\t')
cookiefilename = 'cookiefile.txt'
@@ -105,20 +105,38 @@ def process_video(video):
views = None
else:
views = view_span.get('data-value')
author_a = video.find('a', {'rel': 'author'})
if author_a is None:
author_id = None
author_name = None
else:
author_id = author_a['href'].split('/')[-1]
author_name = author_a.text
video_link = BASE_URL + video.find('a', href = True)['href']
r = make_request(url = video_link)
soup = BeautifulSoup(r.content, features = 'html.parser')
content_div = soup.find('div', {'class': 'container content media-description'})
info = {
'title' : video.find('h3').text,
'thumbnail' : video.find('img')['src'],
'link' : BASE_URL + video.find('a', href = True)['href'],
'link' : video_link,
'views' : views,
'rumbles' : rumbles,
'content': '' if content_div is None else content_div.get_text('\n'),
'duration' : video.find('span', {'class' : 'video-item--duration'})['data-value'],
'datetime' : datetime.fromisoformat(video.find('time')['datetime'])}
'datetime' : datetime.fromisoformat(video.find('time')['datetime']),
'author_id': author_id,
'author_name': author_name}
info['media_url'] = get_media_url(info['link'])
return info
def get_channel_videos(url):
page = 1
@@ -150,8 +168,15 @@ def get_channel_profile(url):
thumbnail_soup = soup.find('img', {'class' : 'listing-header--thumb'})
cover_soup = soup.find('img', {'class' : 'listing-header--backsplash-img'})
author_a = soup.find('a', {'rel': 'author'})
if author_a is None:
author_id = None
else:
author_id = author_a['href'].split('/')[-1]
profile = {
'name': soup.find('h1').text,
'id': author_id,
'verified': verified_svg is not None,
'thumbnail': thumbnail_soup.get('src') if thumbnail_soup else None,
'cover': cover_soup.get('src') if cover_soup else None,

View File

@@ -1,4 +1,6 @@
from .base import ETLController
from .twitter import TwitterTransformer
from .bitchute import BitchuteTransformer
from .telegram_telethon import TelegramTelethonTransformer
from .telegram_telethon import TelegramTelethonTransformer
from .rumble import RumbleTransformer
from .gettr import GettrTransformer

View File

@@ -1,11 +1,13 @@
import json
from loguru import logger
from typing import Generator
from typing import Generator, Union, Callable
from datetime import datetime, timezone
import dateutil.parser
from bs4 import BeautifulSoup
from cisticola.transformer.base import Transformer
from cisticola.base import ScraperResult, Post, Image, Video, Media
from cisticola.base import RawChannelInfo, ScraperResult, Post, Image, Video, Media, Channel, ChannelInfo
class BitchuteTransformer(Transformer):
"""A Bitchute specific ScraperResult, with a method ETL/transforming"""
@@ -19,7 +21,7 @@ class BitchuteTransformer(Transformer):
return False
def transform_media(self, data: ScraperResult, transformed: Post) -> Generator[Media, None, None]:
def transform_media(self, data: ScraperResult, insert: Callable, transformed: Post) -> Generator[Media, None, None]:
raw = json.loads(data.raw_data)
orig = raw['video_url']
@@ -27,9 +29,34 @@ class BitchuteTransformer(Transformer):
m = Video(url=new, post=transformed.id, raw_id=data.id, original_url=orig)
yield m
insert(m)
def transform(self, data: ScraperResult) -> Post:
def transform_info(self, data: RawChannelInfo, insert: Callable, session) -> Generator[Union[Post, Channel, Media], None, None]:
    """Turn a raw Bitchute channel record into a ``ChannelInfo`` and hand it to ``insert``.

    ``data.raw_data`` is parsed as JSON. Nothing is yielded or returned; the
    result is delivered only through the ``insert`` callback. ``session`` is
    not used by this method.
    """
    profile = json.loads(data.raw_data)

    # Keys mirror the ChannelInfo keyword arguments, in the same order.
    fields = {
        'raw_channel_info_id': data.id,
        'channel': data.channel,
        # Last path segment of the owner URL serves as the platform id.
        'platform_id': profile['owner_url'].strip('/').split('/')[-1],
        'platform': data.platform,
        'scraper': data.scraper,
        'transformer': self.__version__,
        # The owner name is used for both the screen name and the display name.
        'screenname': profile['owner_name'],
        'name': profile['owner_name'],
        'description': profile['description'],
        'description_url': '',  # does not exist for Bitchute
        'description_location': '',  # does not exist for Bitchute
        'followers': profile['subscribers'],
        'following': -1,  # does not exist for Bitchute
        'verified': False,  # does not exist for Bitchute
        'date_created': dateutil.parser.parse(profile['created']),
        'date_archived': data.date_archived,
        'date_transformed': datetime.now(timezone.utc),
    }

    insert(ChannelInfo(**fields))
def transform(self, data: ScraperResult, insert: Callable, session) -> Generator[Union[Post, Channel, Media], None, None]:
raw = json.loads(data.raw_data)
soup = BeautifulSoup(raw['body'], features = 'html.parser')
@@ -37,15 +64,17 @@ class BitchuteTransformer(Transformer):
transformed = Post(
raw_id=data.id,
platform_id=raw['id'],
scraper=data.scraper,
transformer=self.__version__,
platform=data.platform,
channel=data.channel,
date=data.date,
date_archived=data.date_archived,
date_transformed=datetime.now(timezone.utc),
url=raw['url'],
content=content,
author_id=raw['author_id'],
author_username=raw['author'])
return transformed
transformed = insert(transformed)

View File

@@ -0,0 +1,78 @@
import json
from loguru import logger
from typing import Generator, Union, Callable
import dateutil.parser
from datetime import datetime, timezone
from cisticola.transformer.base import Transformer
from cisticola.base import RawChannelInfo, ChannelInfo, ScraperResult, Post, Image, Video, Media, Channel
class GettrTransformer(Transformer):
    """Transform raw Gettr scraper output into ``ChannelInfo`` and ``Post`` objects."""

    __version__ = "GettrTransformer 0.0.1"

    def can_handle(self, data: ScraperResult) -> bool:
        """Return True when ``data`` was produced by a ``GettrScraper``.

        Only the first space-separated token of ``data.scraper`` is compared,
        so the scraper's version suffix is ignored.
        """
        return data.scraper.split(' ')[0] == "GettrScraper"

    def transform_info(self, data: RawChannelInfo, insert: Callable, session) -> Generator[Union[Post, Channel, Media], None, None]:
        """Build a ``ChannelInfo`` from a raw Gettr profile and pass it to ``insert``.

        ``data.raw_data`` is parsed as JSON. Nothing is yielded or returned;
        the result is delivered only through the ``insert`` callback.
        ``session`` is not used by this method.
        """
        raw = json.loads(data.raw_data)

        transformed = ChannelInfo(
            raw_channel_info_id=data.id,
            channel=data.channel,
            platform_id=raw['_id'],
            platform=data.platform,
            scraper=data.scraper,
            transformer=self.__version__,
            screenname=raw['username'],
            name=raw['nickname'],
            # Presumably Gettr omits free-text profile fields the user never
            # filled in (TODO confirm); fall back to an empty string instead
            # of aborting the whole transform with a KeyError.
            description=raw.get('dsc', ''),
            description_url=raw.get('website', ''),
            description_location=raw.get('location', ''),
            # NOTE(review): confirm that `flg` is the follower count and `flw`
            # the following count; the abbreviations suggest the mapping may
            # be the other way around.
            followers=raw['flg'],
            following=raw['flw'],
            verified=bool(raw.get('infl')),
            # `cdate` is divided by 1000 before conversion (treated as epoch
            # milliseconds). Pin the result to UTC so it does not depend on
            # the host's local timezone and matches `date_transformed`.
            date_created=datetime.fromtimestamp(raw['cdate'] / 1000.0, tz=timezone.utc),
            date_archived=data.date_archived,
            date_transformed=datetime.now(timezone.utc)
        )

        insert(transformed)

    def transform(self, data: ScraperResult, insert: Callable, session) -> Generator[Union[Post, Channel, Media], None, None]:
        """Build a ``Post`` from a raw Gettr post and pass it to ``insert``.

        ``data.raw_data`` is parsed as JSON. When the activity's action is
        ``"shares_pst"`` the activity's ``uid`` is recorded as
        ``forwarded_from``; otherwise that field is ``None``. Nothing is
        yielded or returned. ``session`` is not used by this method.
        """
        raw = json.loads(data.raw_data)
        activity = raw["activity"]

        forwarded_from = activity["uid"] if activity["action"] == "shares_pst" else None

        # Keep the preview source link only when it is present and non-empty.
        prevsrc = raw.get("prevsrc")
        outlinks = [prevsrc] if prevsrc else []

        transformed = Post(
            raw_id=data.id,
            platform_id=raw["_id"],
            scraper=data.scraper,
            transformer=self.__version__,
            platform=data.platform,
            channel=data.channel,
            # Timezone-aware UTC, independent of the host's local timezone.
            date=datetime.fromtimestamp(activity["cdate"] / 1000.0, tz=timezone.utc),
            date_archived=data.date_archived,
            date_transformed=datetime.now(timezone.utc),
            url="https://www.gettr.com/post/" + raw["_id"],
            content=raw.get("txt", ""),
            author_id=raw["receiver_id"],
            author_username=raw["uid"],
            hashtags=raw.get("htgs", []),
            outlinks=outlinks,
            forwarded_from=forwarded_from)

        insert(transformed)

        # TODO: media handling is not enabled for Gettr yet. Disabled code:
        # media = self.process_media(raw, transformed.id, data)
        # for m in media:
        #     insert(m)

View File

@@ -0,0 +1,70 @@
import json
from loguru import logger
from typing import Generator, Union, Callable
import dateutil.parser
from datetime import datetime, timezone
from cisticola.transformer.base import Transformer
from cisticola.base import RawChannelInfo, ChannelInfo, ScraperResult, Post, Image, Video, Media, Channel
class RumbleTransformer(Transformer):
    """Converts raw Rumble scraper records into ``ChannelInfo`` and ``Post`` objects."""

    __version__ = "RumbleTransformer 0.0.1"

    def can_handle(self, data: ScraperResult) -> bool:
        """Tell whether ``data`` came from a ``RumbleScraper``.

        Only the first space-separated token of ``data.scraper`` is checked.
        """
        scraper_name = data.scraper.split(' ')[0]
        return scraper_name == "RumbleScraper"

    def transform_info(self, data: RawChannelInfo, insert: Callable, session) -> Generator[Union[Post, Channel, Media], None, None]:
        """Create a ``ChannelInfo`` from a raw Rumble channel record and give it to ``insert``.

        ``data.raw_data`` is parsed as JSON. Nothing is yielded or returned;
        ``session`` is not used by this method.
        """
        profile = json.loads(data.raw_data)

        # Keys mirror the ChannelInfo keyword arguments, in the same order.
        fields = {
            'raw_channel_info_id': data.id,
            'channel': data.channel,
            # The raw `id` doubles as platform id and screen name.
            'platform_id': profile['id'],
            'platform': data.platform,
            'scraper': data.scraper,
            'transformer': self.__version__,
            'screenname': profile['id'],
            'name': profile['name'],
            'description': '',  # does not exist for Rumble
            'description_url': '',  # does not exist for Rumble
            'description_location': '',  # does not exist for Rumble
            'followers': profile['subscribers'],
            'following': -1,  # does not exist for Rumble
            'verified': profile['verified'],
            'date_created': None,  # does not exist for Rumble
            'date_archived': data.date_archived,
            'date_transformed': datetime.now(timezone.utc),
        }

        insert(ChannelInfo(**fields))

    def transform(self, data: ScraperResult, insert: Callable, session) -> Generator[Union[Post, Channel, Media], None, None]:
        """Create a ``Post`` from a raw Rumble video record and give it to ``insert``.

        ``data.raw_data`` is parsed as JSON. Nothing is yielded or returned;
        ``session`` is not used by this method.
        """
        video = json.loads(data.raw_data)

        # Keys mirror the Post keyword arguments, in the same order.
        fields = {
            'raw_id': data.id,
            # Last path segment of the media URL serves as the platform id.
            'platform_id': video['media_url'].strip('/').split('/')[-1],
            'scraper': data.scraper,
            'transformer': self.__version__,
            'platform': data.platform,
            'channel': data.channel,
            'date': dateutil.parser.parse(video['datetime']),
            'date_archived': data.date_archived,
            'date_transformed': datetime.now(timezone.utc),
            'url': video['link'],
            'content': video['content'],
            'author_id': video['author_id'],
            'author_username': video['author_name'],
        }

        insert(Post(**fields))

        # TODO: media handling is not enabled for Rumble yet. Disabled code:
        # media = self.process_media(raw, transformed.id, data)
        # for m in media:
        #     insert(m)

View File

@@ -61,7 +61,7 @@ class TelegramTelethonTransformer(Transformer):
self.bad_channels[orig_screenname] = True
return ""
soup = BeautifulSoup(r.content)
soup = BeautifulSoup(r.content, features = 'lxml')
post = soup.findAll("div", {"data-post" : orig_screenname + "/" + str(id)})
name = ""
@@ -181,14 +181,14 @@ class TelegramTelethonTransformer(Transformer):
transformed = insert(transformed)
# for k in data.archived_urls:
# if data.archived_urls[k]:
# archived_url = data.archived_urls[k]
# ext = archived_url.split('.')[-1]
for k in data.archived_urls:
if data.archived_urls[k]:
archived_url = data.archived_urls[k]
ext = archived_url.split('.')[-1]
# if ext == 'mp4' or ext == 'mov' or ext == 'avi' or ext =='mkv':
# insert(Video(url=archived_url, post=transformed.id, raw_id=data.id, original_url=k))
# else:
# insert(Image(url=archived_url, post=transformed.id, raw_id=data.id, original_url=k))
if ext == 'mp4' or ext == 'mov' or ext == 'avi' or ext =='mkv':
insert(Video(url=archived_url, post=transformed.id, raw_id=data.id, original_url=k))
else:
insert(Image(url=archived_url, post=transformed.id, raw_id=data.id, original_url=k))

View File

@@ -2,9 +2,10 @@ import json
from loguru import logger
from typing import Generator, Union, Callable
import dateutil.parser
from datetime import datetime, timezone
from cisticola.transformer.base import Transformer
from cisticola.base import ScraperResult, Post, Image, Video, Media, Channel
from cisticola.base import RawChannelInfo, ChannelInfo, ScraperResult, Post, Image, Video, Media, Channel
class TwitterTransformer(Transformer):
"""A Twitter specific ScraperResult, with a method ETL/transforming"""
@@ -45,8 +46,33 @@ class TwitterTransformer(Transformer):
yield m
def transform_info(self, data: RawChannelInfo, insert: Callable, session) -> Generator[Union[Post, Channel, Media], None, None]:
raw = json.loads(data.raw_data)
def transform(self, data: ScraperResult, insert: Callable) -> Generator[Union[Post, Channel, Media], None, None]:
transformed = ChannelInfo(
raw_channel_info_id=data.id,
channel=data.channel,
platform_id=raw['id'],
platform=data.platform,
scraper=data.scraper,
transformer=self.__version__,
screenname=raw['username'],
name=raw['displayname'],
description=raw['rawDescription'],
description_url=raw['linkUrl'],
description_location=raw['location'],
followers=raw['followersCount'],
following=raw['friendsCount'],
verified=raw['verified'],
date_created=dateutil.parser.parse(raw['created']),
date_archived=data.date_archived,
date_transformed=datetime.now(timezone.utc)
)
transformed = insert(transformed)
def transform(self, data: ScraperResult, insert: Callable, session) -> Generator[Union[Post, Channel, Media], None, None]:
raw = json.loads(data.raw_data)
transformed = Post(
@@ -58,6 +84,7 @@ class TwitterTransformer(Transformer):
channel=data.channel,
date=dateutil.parser.parse(raw['date']),
date_archived=data.date_archived,
date_transformed=datetime.now(timezone.utc),
url=raw['url'],
content=raw['content'],
author_id=raw['user']['id'],
@@ -85,6 +112,7 @@ class TwitterTransformer(Transformer):
channel=channel.id,
date=dateutil.parser.parse(tweet['date']),
date_archived=data.date_archived,
date_transformed=datetime.now(timezone.utc),
url=tweet['url'],
content=tweet['content'],
author_id=tweet['user']['id'],
@@ -109,7 +137,4 @@ class TwitterTransformer(Transformer):
media = self.process_media(raw, transformed.id, data)
for m in media:
insert(m)
insert(m)

View File

@@ -0,0 +1,34 @@
from sqlalchemy.orm import sessionmaker
import json
import pytest
from cisticola.base import Channel
from cisticola.scraper import BitchuteScraper
from cisticola.transformer import BitchuteTransformer
from cisticola.base import Post, Media
@pytest.mark.media
def test_scrape_etl_bitchute(engine, controller, etl_controller, channel_kwargs):
    """Scrape the Bitchute fixture channel, run the ETL, and check the stored posts."""
    controller.reset_db()

    # Scrape stage: one channel built from the shared fixture kwargs.
    bitchute_channels = [Channel(**channel_kwargs['bitchute'])]
    controller.register_scraper(scraper=BitchuteScraper())
    controller.scrape_channels(channels=bitchute_channels, archive_media=True)

    # Transform stage: posts first, then channel info.
    etl_controller.register_transformer(BitchuteTransformer())
    etl_controller.transform_all_untransformed()
    etl_controller.transform_all_untransformed_info()

    # Read the results back through a session bound to the test engine.
    session = sessionmaker(bind=engine)()
    posts = session.query(Post).all()
    media = session.query(Media).all()

    assert len(posts) == 5
    # Media assertions are currently disabled:
    # assert len(media) == 0
    assert 'Pendant are something that the advanced ladies can fuse in her every day look' in posts[0].content
    # assert json.loads(media[0].exif)['Composite:ImageSize'] == "826 728"

View File

@@ -0,0 +1,34 @@
from sqlalchemy.orm import sessionmaker
import json
import pytest
from cisticola.base import Channel
from cisticola.scraper import GettrScraper
from cisticola.transformer import GettrTransformer
from cisticola.base import Post, Media
@pytest.mark.media
def test_scrape_etl_gettr(engine, controller, etl_controller, channel_kwargs):
    """Scrape the Gettr fixture channel, run the ETL, and check the stored posts."""
    controller.reset_db()

    # Scrape stage: one channel built from the shared fixture kwargs.
    gettr_channels = [Channel(**channel_kwargs['gettr'])]
    controller.register_scraper(scraper=GettrScraper())
    controller.scrape_channels(channels=gettr_channels, archive_media=True)

    # Transform stage: posts first, then channel info.
    etl_controller.register_transformer(GettrTransformer())
    etl_controller.transform_all_untransformed()
    etl_controller.transform_all_untransformed_info()

    # Read the results back through a session bound to the test engine.
    session = sessionmaker(bind=engine)()
    posts = session.query(Post).all()
    media = session.query(Media).all()

    assert len(posts) == 23
    # Media assertions are currently disabled:
    # assert len(media) == 0
    assert 'Nigerian gender studies' in posts[-1].content
    # assert json.loads(media[0].exif)['Composite:ImageSize'] == "826 728"

View File

@@ -0,0 +1,34 @@
from sqlalchemy.orm import sessionmaker
import json
import pytest
from cisticola.base import Channel
from cisticola.scraper import RumbleScraper
from cisticola.transformer import RumbleTransformer
from cisticola.base import Post, Media
@pytest.mark.media
def test_scrape_etl_rumble(engine, controller, etl_controller, channel_kwargs):
    """Scrape the Rumble fixture channel, run the ETL, and check the stored posts."""
    controller.reset_db()

    # Scrape stage: one channel built from the shared fixture kwargs.
    rumble_channels = [Channel(**channel_kwargs['rumble'])]
    controller.register_scraper(scraper=RumbleScraper())
    controller.scrape_channels(channels=rumble_channels, archive_media=True)

    # Transform stage: posts first, then channel info.
    etl_controller.register_transformer(RumbleTransformer())
    etl_controller.transform_all_untransformed()
    etl_controller.transform_all_untransformed_info()

    # Read the results back through a session bound to the test engine.
    session = sessionmaker(bind=engine)()
    posts = session.query(Post).all()
    media = session.query(Media).all()

    assert len(posts) == 7
    # Media assertions are currently disabled:
    # assert len(media) == 0
    assert '#whitegold #icedoutcuban' in posts[0].content
    # assert json.loads(media[0].exif)['Composite:ImageSize'] == "826 728"

View File

@@ -0,0 +1,34 @@
from sqlalchemy.orm import sessionmaker, with_polymorphic
import json
import pytest
from cisticola.base import Channel
from cisticola.scraper import TelegramTelethonScraper
from cisticola.transformer import TelegramTelethonTransformer
from cisticola.base import Post, Media
@pytest.mark.media
def test_scrape_etl_telegram_telethon(engine, controller, etl_controller, channel_kwargs):
    """Scrape the Telegram fixture channel, run the ETL, and check stored posts and media."""
    controller.reset_db()

    # Scrape stage: one channel built from the shared fixture kwargs.
    telegram_channels = [Channel(**channel_kwargs['telegram'])]
    controller.register_scraper(scraper=TelegramTelethonScraper())
    controller.scrape_channels(channels=telegram_channels, archive_media=True)

    # Transform stage: posts first, then channel info.
    etl_controller.register_transformer(TelegramTelethonTransformer())
    etl_controller.transform_all_untransformed()
    etl_controller.transform_all_untransformed_info()

    # Read the results back through a session bound to the test engine.
    session = sessionmaker(bind=engine)()
    posts = session.query(Post).all()
    media = session.query(Media).all()

    assert len(posts) == 19
    assert len(media) == 13

    assert posts[16].content == "Taking pre-orders now"
    # The first media row's EXIF JSON must report the expected image size.
    first_exif = json.loads(media[0].exif)
    assert first_exif['Composite:ImageSize'] == "1028 1280"

View File

@@ -1,4 +1,4 @@
from sqlalchemy.orm import sessionmaker, with_polymorphic
from sqlalchemy.orm import sessionmaker
import json
import pytest
@@ -18,6 +18,7 @@ def test_scrape_etl_twitter(engine, controller, etl_controller, channel_kwargs):
etl_controller.register_transformer(TwitterTransformer())
etl_controller.transform_all_untransformed()
etl_controller.transform_all_untransformed_info()
sessionfactory = sessionmaker()
sessionfactory.configure(bind=engine)
@@ -26,8 +27,8 @@ def test_scrape_etl_twitter(engine, controller, etl_controller, channel_kwargs):
posts = session.query(Post).all()
media = session.query(Media).all()
assert len(posts) == 10
assert len(media) == 7
assert len(posts) == 12
assert len(media) == 4
assert posts[-1].content == "BARN"
assert json.loads(media[-1].exif)['Composite:ImageSize'] == "826 728"
assert posts[2].content == "BARN"
assert json.loads(media[0].exif)['Composite:ImageSize'] == "826 728"