mirror of
https://github.com/bellingcat/cisticola.git
synced 2026-06-11 12:58:33 +03:00
Merge branch 'more-channel-info-transformers' of https://github.com/bellingcat/cisticola into main
This commit is contained in:
1
Pipfile
1
Pipfile
@@ -9,6 +9,7 @@ loguru = "*"
|
||||
gogettr = "*"
|
||||
requests = "*"
|
||||
bs4 = "*"
|
||||
lxml = "*"
|
||||
dateparser = "*"
|
||||
boto3 = "*"
|
||||
ffmpeg-python = "*"
|
||||
|
||||
196
Pipfile.lock
generated
196
Pipfile.lock
generated
@@ -1,7 +1,7 @@
|
||||
{
|
||||
"_meta": {
|
||||
"hash": {
|
||||
"sha256": "3f0312747083221d57ac2c4ce96786a6ead34aa3a3a3519fed4ea4382f672633"
|
||||
"sha256": "13cc50755a59b2cd8bf93049a9a695aa27d35b973b0bdc154af5d21ce48fd57f"
|
||||
},
|
||||
"pipfile-spec": 6,
|
||||
"requires": {
|
||||
@@ -21,7 +21,6 @@
|
||||
"sha256:58d5c3d29f5a36ffeb94f02f0d786cd53014cf9b3b3951d42e0080d8a9498d30",
|
||||
"sha256:ad9aa55b65ef2808eb405f46cf74df7fcb7044d5cbc26487f96eb2ef2e436693"
|
||||
],
|
||||
"markers": "python_version >= '3.6'",
|
||||
"version": "==4.11.1"
|
||||
},
|
||||
"blis": {
|
||||
@@ -50,19 +49,18 @@
|
||||
},
|
||||
"boto3": {
|
||||
"hashes": [
|
||||
"sha256:927b5e8e2decad746e6c32bb81f15c2ea9ab4398286134d21f6742493eb893f6",
|
||||
"sha256:e3c10adc7be890b147568a4162d9cafb876f11f87460c4a0dc90742d6d4ebe7c"
|
||||
"sha256:1c13d555172cf88eb645af2429e4a7f42be85e365d6ffc110c952a556d3f8808",
|
||||
"sha256:4af6a8bc5110b5f9d2fbd00a3c110e4c4cc36fae78d05afa354831f5789e363b"
|
||||
],
|
||||
"index": "pypi",
|
||||
"version": "==1.24.2"
|
||||
"version": "==1.24.6"
|
||||
},
|
||||
"botocore": {
|
||||
"hashes": [
|
||||
"sha256:131f71fe16ef84f9e0e72c54d2e230a6d8e79dd3947f507259a129649649a35d",
|
||||
"sha256:b7cdd4f4a6395a084a381a7d2a25b177e6de5f8a4dfa3c645ec957ba3c83e200"
|
||||
"sha256:97c909a6ec5ad421573c18ae67fc6ea4232502cd30cffaf03bfcb584d9df652d",
|
||||
"sha256:eeebe304161db6828413dc358ea80ece52f4ddbc8ecde4dd58978d5861a09293"
|
||||
],
|
||||
"markers": "python_version >= '3.7'",
|
||||
"version": "==1.27.2"
|
||||
"version": "==1.27.6"
|
||||
},
|
||||
"brotli": {
|
||||
"hashes": [
|
||||
@@ -144,7 +142,6 @@
|
||||
"sha256:6a94c6402995a99c3970cc7e4884bb60b4a8639938157eeed436098bf9831757",
|
||||
"sha256:f9f17d2aec496a9aa6b76f53e3b614c965223c061982d434d160f930c698a9db"
|
||||
],
|
||||
"markers": "python_version ~= '3.7'",
|
||||
"version": "==5.2.0"
|
||||
},
|
||||
"catalogue": {
|
||||
@@ -152,7 +149,6 @@
|
||||
"sha256:535d33ae79ebd21ca298551d85da186ae8b8e1df36b0fb0246da774163ec2d6b",
|
||||
"sha256:cab4feda641fe05da1e6a1a9d123b0869d5ca324dcd93d4a5c384408ab62e7fb"
|
||||
],
|
||||
"markers": "python_version >= '3.6'",
|
||||
"version": "==2.0.7"
|
||||
},
|
||||
"certifi": {
|
||||
@@ -160,7 +156,6 @@
|
||||
"sha256:9c5705e395cd70084351dd8ad5c41e65655e08ce46f2ec9cf6c2c08390f71eb7",
|
||||
"sha256:f1d53542ee8cbedbe2118b5686372fb33c297fcd6379b050cca0ef13a597382a"
|
||||
],
|
||||
"markers": "python_version >= '3.6'",
|
||||
"version": "==2022.5.18.1"
|
||||
},
|
||||
"charset-normalizer": {
|
||||
@@ -168,7 +163,6 @@
|
||||
"sha256:2857e29ff0d34db842cd7ca3230549d1a697f96ee6d3fb071cfa6c7393832597",
|
||||
"sha256:6881edbebdb17b39b4eaaa821b438bf6eddffb4468cf344f09f89def34a8b1df"
|
||||
],
|
||||
"markers": "python_version >= '3'",
|
||||
"version": "==2.0.12"
|
||||
},
|
||||
"click": {
|
||||
@@ -176,7 +170,6 @@
|
||||
"sha256:6a7a62563bbfabfda3a38f3023a1db4a35978c0abd76f6c9605ecd6554d6d9b1",
|
||||
"sha256:8458d7b1287c5fb128c90e23381cf99dcde74beaf6c7ff6384ce84d6fe090adb"
|
||||
],
|
||||
"markers": "python_version >= '3.6'",
|
||||
"version": "==8.0.4"
|
||||
},
|
||||
"cryptg": {
|
||||
@@ -261,24 +254,14 @@
|
||||
"index": "pypi",
|
||||
"version": "==0.2.0"
|
||||
},
|
||||
"filelock": {
|
||||
"hashes": [
|
||||
"sha256:37def7b658813cda163b56fc564cdc75e86d338246458c4c28ae84cabefa2404",
|
||||
"sha256:3a0fd85166ad9dbab54c9aec96737b744106dc5f15c0b09a6744a445299fcf04"
|
||||
],
|
||||
"markers": "python_version >= '3.7'",
|
||||
"version": "==3.7.1"
|
||||
},
|
||||
"future": {
|
||||
"hashes": [
|
||||
"sha256:b1bead90b70cf6ec3f0710ae53a525360fa360d306a86583adc6bf83a4db537d"
|
||||
],
|
||||
"markers": "python_version >= '2.6' and python_version not in '3.0, 3.1, 3.2'",
|
||||
"version": "==0.18.2"
|
||||
},
|
||||
"gabber": {
|
||||
"git": "https://github.com/stanfordio/gabber.git",
|
||||
"ref": "a032db8047fa6b762b2fc127b08ee37d6ad9e110"
|
||||
"git": "https://github.com/stanfordio/gabber.git"
|
||||
},
|
||||
"gogettr": {
|
||||
"hashes": [
|
||||
@@ -290,19 +273,17 @@
|
||||
},
|
||||
"google-auth": {
|
||||
"hashes": [
|
||||
"sha256:1ba4938e032b73deb51e59c4656a00e0939cf0b1112575099f136babb4563312",
|
||||
"sha256:349ac49b18b01019453cc99c11c92ed772739778c92f184002b7ab3a5b7ac77d"
|
||||
"sha256:8a954960f852d5f19e6af14dd8e75c20159609e85d8db37e4013cc8c3824a7e1",
|
||||
"sha256:df549a1433108801b11bdcc0e312eaf0d5f0500db42f0523e4d65c78722e8475"
|
||||
],
|
||||
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4, 3.5'",
|
||||
"version": "==2.6.6"
|
||||
"version": "==2.7.0"
|
||||
},
|
||||
"google-auth-oauthlib": {
|
||||
"hashes": [
|
||||
"sha256:24f67735513c4c7134dbde2f1dee5a1deb6acc8dfcb577d7bff30d213a28e7b0",
|
||||
"sha256:30596b824fc6808fdaca2f048e4998cc40fb4b3599eaea66d28dc7085b36c5b8"
|
||||
"sha256:6d6161d0ec0a62e2abf2207c6071c117ec5897b300823c4bb2d963ee86e20e4f",
|
||||
"sha256:d5e98a71203330699f92a26bc08847a92e8c3b1b8d82a021f1af34164db143ae"
|
||||
],
|
||||
"markers": "python_version >= '3.6'",
|
||||
"version": "==0.5.1"
|
||||
"version": "==0.5.2"
|
||||
},
|
||||
"greenlet": {
|
||||
"hashes": [
|
||||
@@ -377,22 +358,20 @@
|
||||
"sha256:84d9dd047ffa80596e0f246e2eab0b391788b0503584e8945f2368256d2735ff",
|
||||
"sha256:9d643ff0a55b762d5cdb124b8eaa99c66322e2157b69160bc32796e824360e6d"
|
||||
],
|
||||
"markers": "python_version >= '3'",
|
||||
"version": "==3.3"
|
||||
},
|
||||
"instaloader": {
|
||||
"hashes": [
|
||||
"sha256:7fa6147810eedcc1dedcdec8cfa1f220c9379ab8faeab6a336a7c181d944e2e4"
|
||||
"sha256:77d4a140aafd1a9f48765db1f5ede9b74136eda67f428bfc392d7440b26ae74c"
|
||||
],
|
||||
"index": "pypi",
|
||||
"version": "==4.9"
|
||||
"version": "==4.9.1"
|
||||
},
|
||||
"jinja2": {
|
||||
"hashes": [
|
||||
"sha256:31351a702a408a9e7595a8fc6150fc3f43bb6bf7e319770cbc0db9df9437e852",
|
||||
"sha256:6088930bfe239f0e6710546ab9c19c9ef35e29792895fed6e6e31a023a182a61"
|
||||
],
|
||||
"markers": "python_version >= '3.7'",
|
||||
"version": "==3.1.2"
|
||||
},
|
||||
"jmespath": {
|
||||
@@ -400,7 +379,6 @@
|
||||
"sha256:a490e280edd1f57d6de88636992d05b71e97d69a26a19f058ecf7d304474bf5e",
|
||||
"sha256:e8dcd576ed616f14ec02eed0005c85973b5890083313860136657e24784e4c04"
|
||||
],
|
||||
"markers": "python_version >= '3.7'",
|
||||
"version": "==1.0.0"
|
||||
},
|
||||
"langcodes": {
|
||||
@@ -408,7 +386,6 @@
|
||||
"sha256:4d89fc9acb6e9c8fdef70bcdf376113a3db09b67285d9e1d534de6d8818e7e69",
|
||||
"sha256:794d07d5a28781231ac335a1561b8442f8648ca07cd518310aeb45d6f0807ef6"
|
||||
],
|
||||
"markers": "python_version >= '3.6'",
|
||||
"version": "==3.3.0"
|
||||
},
|
||||
"langdetect": {
|
||||
@@ -421,11 +398,11 @@
|
||||
},
|
||||
"loguru": {
|
||||
"hashes": [
|
||||
"sha256:b28e72ac7a98be3d28ad28570299a393dfcd32e5e3f6a353dec94675767b6319",
|
||||
"sha256:f8087ac396b5ee5f67c963b495d615ebbceac2796379599820e324419d53667c"
|
||||
"sha256:066bd06758d0a513e9836fd9c6b5a75bfb3fd36841f4b996bc60b547a309d41c",
|
||||
"sha256:4e2414d534a2ab57573365b3e6d0234dfb1d84b68b7f3b948e6fb743860a77c3"
|
||||
],
|
||||
"index": "pypi",
|
||||
"version": "==0.5.3"
|
||||
"version": "==0.6.0"
|
||||
},
|
||||
"lxml": {
|
||||
"hashes": [
|
||||
@@ -493,7 +470,7 @@
|
||||
"sha256:f6d23a01921b741774f35e924d418a43cf03eca1444f3fdfd7978d35a5aaab8b",
|
||||
"sha256:fcdf70191f0d1761d190a436db06a46f05af60e1410e1507935f0332280c9268"
|
||||
],
|
||||
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
|
||||
"index": "pypi",
|
||||
"version": "==4.9.0"
|
||||
},
|
||||
"markupsafe": {
|
||||
@@ -539,7 +516,6 @@
|
||||
"sha256:f121a1420d4e173a5d96e47e9a0c0dcff965afdf1626d28de1460815f7c4ee7a",
|
||||
"sha256:fc7b548b17d238737688817ab67deebb30e8073c95749d55538ed473130ec0c7"
|
||||
],
|
||||
"markers": "python_version >= '3.7'",
|
||||
"version": "==2.1.1"
|
||||
},
|
||||
"murmurhash": {
|
||||
@@ -571,7 +547,6 @@
|
||||
"sha256:6397602efb3c2d7baebd2166ed85731ae1c1d475abca22090b7141ff5034b3e1",
|
||||
"sha256:9c9f243fcec7f410f138cb12c21c84c64fde4195481a30c9bfb05b5f003adfed"
|
||||
],
|
||||
"markers": "python_version >= '3.5' and python_version < '4'",
|
||||
"version": "==1.45.1"
|
||||
},
|
||||
"numpy": {
|
||||
@@ -599,7 +574,6 @@
|
||||
"sha256:f5a1c7c45ff29db501f9e38a360aedd833e355c14c75155ba2bd46ee3799e30a",
|
||||
"sha256:fde47931544086a648b12ee7c9ccf30edd6c6db776005fb07e4a019a04980042"
|
||||
],
|
||||
"markers": "python_version >= '3.8'",
|
||||
"version": "==1.23.0rc2"
|
||||
},
|
||||
"oauthlib": {
|
||||
@@ -607,50 +581,29 @@
|
||||
"sha256:23a8208d75b902797ea29fd31fa80a15ed9dc2c6c16fe73f5d346f83f6fa27a2",
|
||||
"sha256:6db33440354787f9b7f3a6dbd4febf5d0f93758354060e802f6c06cb493022fe"
|
||||
],
|
||||
"markers": "python_version >= '3.6'",
|
||||
"version": "==3.2.0"
|
||||
},
|
||||
"ocrd-pyexiftool": {
|
||||
"hashes": [
|
||||
"sha256:13d7aeabd765256e7640e4198cc742538a9c458b34aca6644b356c6e908c922a",
|
||||
"sha256:457a432d167174e93f63a487879ea767b0ff54aef539e40586fffe5fb9050461",
|
||||
"sha256:9c77e753769857657069de76d2c4b592efbd99db3974a76df561fd0ca75cec0e"
|
||||
],
|
||||
"index": "pypi",
|
||||
"version": "==0.2.0"
|
||||
},
|
||||
"packaging": {
|
||||
"hashes": [
|
||||
"sha256:dd47c42927d89ab911e606518907cc2d3a1f38bbd026385970643f9c5b8ecfeb",
|
||||
"sha256:ef103e05f519cdc783ae24ea4e2e0f508a9c99b2d4969652eed6a2e1ea5bd522"
|
||||
],
|
||||
"markers": "python_version >= '3.6'",
|
||||
"version": "==21.3"
|
||||
},
|
||||
"pandas": {
|
||||
"hashes": [
|
||||
"sha256:0010771bd9223f7afe5f051eb47c4a49534345dfa144f2f5470b27189a4dd3b5",
|
||||
"sha256:061609334a8182ab500a90fe66d46f6f387de62d3a9cb9aa7e62e3146c712167",
|
||||
"sha256:09d8be7dd9e1c4c98224c4dfe8abd60d145d934e9fc1f5f411266308ae683e6a",
|
||||
"sha256:295872bf1a09758aba199992c3ecde455f01caf32266d50abc1a073e828a7b9d",
|
||||
"sha256:3228198333dd13c90b6434ddf61aa6d57deaca98cf7b654f4ad68a2db84f8cfe",
|
||||
"sha256:385c52e85aaa8ea6a4c600a9b2821181a51f8be0aee3af6f2dcb41dafc4fc1d0",
|
||||
"sha256:51649ef604a945f781105a6d2ecf88db7da0f4868ac5d45c51cb66081c4d9c73",
|
||||
"sha256:5586cc95692564b441f4747c47c8a9746792e87b40a4680a2feb7794defb1ce3",
|
||||
"sha256:5a206afa84ed20e07603f50d22b5f0db3fb556486d8c2462d8bc364831a4b417",
|
||||
"sha256:5b79af3a69e5175c6fa7b4e046b21a646c8b74e92c6581a9d825687d92071b51",
|
||||
"sha256:5c54ea4ef3823108cd4ec7fb27ccba4c3a775e0f83e39c5e17f5094cb17748bc",
|
||||
"sha256:8c5bf555b6b0075294b73965adaafb39cf71c312e38c5935c93d78f41c19828a",
|
||||
"sha256:92bc1fc585f1463ca827b45535957815b7deb218c549b7c18402c322c7549a12",
|
||||
"sha256:95c1e422ced0199cf4a34385ff124b69412c4bc912011ce895582bee620dfcaa",
|
||||
"sha256:b8134651258bce418cb79c71adeff0a44090c98d955f6953168ba16cc285d9f7",
|
||||
"sha256:be67c782c4f1b1f24c2f16a157e12c2693fd510f8df18e3287c77f33d124ed07",
|
||||
"sha256:c072c7f06b9242c855ed8021ff970c0e8f8b10b35e2640c657d2a541c5950f59",
|
||||
"sha256:d0d4f13e4be7ce89d7057a786023c461dd9370040bdb5efa0a7fe76b556867a0",
|
||||
"sha256:df82739e00bb6daf4bba4479a40f38c718b598a84654cbd8bb498fd6b0aa8c16",
|
||||
"sha256:f549097993744ff8c41b5e8f2f0d3cbfaabe89b4ae32c8c08ead6cc535b80139",
|
||||
"sha256:ff08a14ef21d94cdf18eef7c569d66f2e24e0bc89350bcd7d243dd804e3b5eb2"
|
||||
],
|
||||
"markers": "python_version >= '3.8'",
|
||||
"version": "==1.4.2"
|
||||
},
|
||||
"pathy": {
|
||||
"hashes": [
|
||||
"sha256:25fd04cec6393661113086730ce69c789d121bea83ab1aa18452e8fd42faf29a",
|
||||
"sha256:838624441f799a06b446a657e4ecc9ebc3fdd05234397e044a7c87e8f6e76b1c"
|
||||
],
|
||||
"markers": "python_version >= '3.6'",
|
||||
"version": "==0.6.1"
|
||||
},
|
||||
"pillow": {
|
||||
@@ -694,12 +647,10 @@
|
||||
"sha256:f3f6a6034140e9e17e9abc175fc7a266a6e63652028e157750bd98e804a8ed9a",
|
||||
"sha256:ffde4c6fabb52891d81606411cbfaf77756e3b561b566efd270b3ed3791fde4e"
|
||||
],
|
||||
"markers": "python_version >= '3.7'",
|
||||
"version": "==9.1.1"
|
||||
},
|
||||
"polyphemus": {
|
||||
"git": "https://github.com/bellingcat/polyphemus",
|
||||
"ref": "b18e5591fa4f903e5506742c2e3f17d45bb88755"
|
||||
"git": "https://github.com/bellingcat/polyphemus"
|
||||
},
|
||||
"preshed": {
|
||||
"hashes": [
|
||||
@@ -814,7 +765,6 @@
|
||||
"sha256:f75009715dcf4a3d680c2338ab19dac5498f8121173a929872950f4fb3a48fbf",
|
||||
"sha256:f8524b8bc89470cec7ac51734907818d3620fb1637f8f8b542d650ebec42a126"
|
||||
],
|
||||
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
|
||||
"version": "==3.14.1"
|
||||
},
|
||||
"pydantic": {
|
||||
@@ -842,29 +792,15 @@
|
||||
"sha256:ea5cb40a3b23b3265f6325727ddfc45141b08ed665458be8c6285e7b85bd73a1",
|
||||
"sha256:fec866a0b59f372b7e776f2d7308511784dace622e0992a0b59ea3ccee0ae833"
|
||||
],
|
||||
"markers": "python_full_version >= '3.6.1'",
|
||||
"version": "==1.8.2"
|
||||
},
|
||||
"pyexiftool": {
|
||||
"git": "https://github.com/smarnach/pyexiftool.git",
|
||||
"ref": "3db3764895e687d75b42d3ae4e554ca8664a7f6f"
|
||||
},
|
||||
"pyparsing": {
|
||||
"hashes": [
|
||||
"sha256:2b020ecf7d21b687f219b71ecad3631f644a47f01403fa1d1036b0c6416d70fb",
|
||||
"sha256:5026bae9a10eeaefb61dab2f09052b9f4307d44aee4eda64b309723d8d206bbc"
|
||||
],
|
||||
"markers": "python_full_version >= '3.6.8'",
|
||||
"version": "==3.0.9"
|
||||
},
|
||||
"pysocks": {
|
||||
"hashes": [
|
||||
"sha256:08e69f092cc6dbe92a0fdd16eeb9b9ffbc13cadfe5ca4c7bd92ffb078b293299",
|
||||
"sha256:2725bd0a9925919b9b51739eea5f9e2bae91e83288108a9ad338b2e3a4435ee5",
|
||||
"sha256:3f8804571ebe159c380ac6de37643bb4685970655d3bba243530d6558b799aa0"
|
||||
],
|
||||
"version": "==1.7.1"
|
||||
},
|
||||
"pytesseract": {
|
||||
"hashes": [
|
||||
"sha256:7e2bafc7f48d1bb71443ce4633a56f5e21925a98f220a36c336297edcd1956d0",
|
||||
@@ -878,7 +814,6 @@
|
||||
"sha256:0123cacc1627ae19ddf3c27a5de5bd67ee4586fbdd6440d9748f8abb483d3e86",
|
||||
"sha256:961d03dc3453ebbc59dbdea9e4e11c5651520a876d0f4db161e8674aae935da9"
|
||||
],
|
||||
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2'",
|
||||
"version": "==2.8.2"
|
||||
},
|
||||
"pytz": {
|
||||
@@ -894,7 +829,6 @@
|
||||
"sha256:8314c9692a636c8eb3bda879b9f119e350e93223ae83e70e80c31675a0fdc1a6",
|
||||
"sha256:af097bae1b616dde5c5744441e2ddc69e74dfdcb0c263129610d85b87445a59d"
|
||||
],
|
||||
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4, 3.5'",
|
||||
"version": "==0.1.0.post0"
|
||||
},
|
||||
"ratelimit": {
|
||||
@@ -981,23 +915,21 @@
|
||||
"sha256:fbc88d3ba402b5d041d204ec2449c4078898f89c4a6e6f0ed1c1a510ef1e221d",
|
||||
"sha256:fbd3fe37353c62fd0eb19fb76f78aa693716262bcd5f9c14bb9e5aca4b3f0dc4"
|
||||
],
|
||||
"markers": "python_version >= '3.6'",
|
||||
"version": "==2022.3.2"
|
||||
},
|
||||
"requests": {
|
||||
"hashes": [
|
||||
"sha256:68d7c56fd5a8999887728ef304a6d12edc7be74f1cfa47714fc8b414525c9a61",
|
||||
"sha256:f22fa1e554c9ddfd16e6e41ac79759e17be9e492b3587efa038054674760e72d"
|
||||
"sha256:bc7861137fbce630f17b03d3ad02ad0bf978c844f3536d0edda6499dafce2b6f",
|
||||
"sha256:d568723a7ebd25875d8d1eaf5dfa068cd2fc8194b2e483d7b1f7c81918dbec6b"
|
||||
],
|
||||
"index": "pypi",
|
||||
"version": "==2.27.1"
|
||||
"version": "==2.28.0"
|
||||
},
|
||||
"requests-oauthlib": {
|
||||
"hashes": [
|
||||
"sha256:2577c501a2fb8d05a304c09d090d6e47c306fef15809d102b327cf8364bddab5",
|
||||
"sha256:75beac4a47881eeb94d5ea5d6ad31ef88856affe2332b9aafb52c6452ccf0d7a"
|
||||
],
|
||||
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
|
||||
"version": "==1.3.1"
|
||||
},
|
||||
"rsa": {
|
||||
@@ -1005,7 +937,6 @@
|
||||
"sha256:5c6bd9dc7a543b7fe4304a631f8a8a3b674e2bbfc49c2ae96200cdbe55df6b17",
|
||||
"sha256:95c5d300c4e879ee69708c428ba566c59478fd653cc3a22243eeb8ed846950bb"
|
||||
],
|
||||
"markers": "python_version >= '3.6' and python_version < '4'",
|
||||
"version": "==4.8"
|
||||
},
|
||||
"s3transfer": {
|
||||
@@ -1013,23 +944,13 @@
|
||||
"sha256:06176b74f3a15f61f1b4f25a1fc29a4429040b7647133a463da8fa5bd28d5ecd",
|
||||
"sha256:2ed07d3866f523cc561bf4a00fc5535827981b117dd7876f036b0c1aca42c947"
|
||||
],
|
||||
"markers": "python_version >= '3.7'",
|
||||
"version": "==0.6.0"
|
||||
},
|
||||
"setuptools": {
|
||||
"hashes": [
|
||||
"sha256:68e45d17c9281ba25dc0104eadd2647172b3472d9e01f911efa57965e8d51a36",
|
||||
"sha256:a43bdedf853c670e5fed28e5623403bad2f73cf02f9a2774e91def6bda8265a7"
|
||||
],
|
||||
"markers": "python_version >= '3.7'",
|
||||
"version": "==62.3.2"
|
||||
},
|
||||
"six": {
|
||||
"hashes": [
|
||||
"sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926",
|
||||
"sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254"
|
||||
],
|
||||
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2'",
|
||||
"version": "==1.16.0"
|
||||
},
|
||||
"smart-open": {
|
||||
@@ -1037,19 +958,16 @@
|
||||
"sha256:71d14489da58b60ce12fc3ecb823facc59a8b23cd1b58edb97175640350d3a62",
|
||||
"sha256:75abf758717a92a8f53aa96953f0c245c8cedf8e1e4184903db3659b419d4c17"
|
||||
],
|
||||
"markers": "python_version >= '3.6' and python_version < '4'",
|
||||
"version": "==5.2.1"
|
||||
},
|
||||
"snscrape": {
|
||||
"git": "https://github.com/bellingcat/snscrape",
|
||||
"ref": "0822a9c3548c4d0736a98f617d823b8475d24fda"
|
||||
"git": "https://github.com/bellingcat/snscrape"
|
||||
},
|
||||
"soupsieve": {
|
||||
"hashes": [
|
||||
"sha256:3b2503d3c7084a42b1ebd08116e5f81aadfaea95863628c80a3b774a11b7c759",
|
||||
"sha256:fc53893b3da2c33de295667a0e19f078c14bf86544af307354de5fcf12a3f30d"
|
||||
],
|
||||
"markers": "python_version >= '3.6'",
|
||||
"version": "==2.3.2.post1"
|
||||
},
|
||||
"spacy": {
|
||||
@@ -1079,7 +997,6 @@
|
||||
"sha256:4f7dcbc4e6c8e8cb4eadbb009f9c0a1a2a67442e0032c8d6776c9470c3759903",
|
||||
"sha256:dfd58b0cc65b3596cb06f7b95e7bf4fff34668297c59eb179eb050db07b199df"
|
||||
],
|
||||
"markers": "python_version >= '3.6'",
|
||||
"version": "==3.0.9"
|
||||
},
|
||||
"spacy-loggers": {
|
||||
@@ -1087,7 +1004,6 @@
|
||||
"sha256:d48c9313a577ad1818da961cf6db71a73fd1e556ae47e6e68d7e28b541d11e18",
|
||||
"sha256:e75d44f4cf99e6763d7132ca7c8c420e0a92790222a08bc8eb9e24ea2c13536e"
|
||||
],
|
||||
"markers": "python_version >= '3.6'",
|
||||
"version": "==1.0.2"
|
||||
},
|
||||
"sqlalchemy": {
|
||||
@@ -1101,6 +1017,7 @@
|
||||
"sha256:3197441772dc3b1c6419f13304402f2418a18d7fe78000aa5a026e7100836739",
|
||||
"sha256:3688f92c62db6c5df268e2264891078f17ecb91e3141b400f2e28d0f75796dea",
|
||||
"sha256:3862a069a24f354145e01a76c7c720c263d62405fe5bed038c46a7ce900f5dd6",
|
||||
"sha256:4a17c1a1152ca4c29d992714aa9df3054da3af1598e02134f2e7314a32ef69d8",
|
||||
"sha256:4c1d9fb3931e27d59166bb5c4dcc911400fee51082cfba66ceb19ac954ade068",
|
||||
"sha256:4e8706919829d455a9fa687c6bbd1b048e36fec3919a59f2d366247c2bfdbd9c",
|
||||
"sha256:50c8eaf44c3fed5ba6758d375de25f163e46137c39fda3a72b9ee1d1bb327dfc",
|
||||
@@ -1153,7 +1070,6 @@
|
||||
"sha256:f96af9fde9f58d5923091fa723fa0fed58a83781b98e143a5d1fac5e738b9f0d",
|
||||
"sha256:fb08416fd6ef04c51fdeefd6d28592b64563b2853243c571a9b0d67403b5be7f"
|
||||
],
|
||||
"markers": "python_version >= '3.6'",
|
||||
"version": "==2.4.3"
|
||||
},
|
||||
"telethon": {
|
||||
@@ -1186,7 +1102,6 @@
|
||||
"sha256:eba973fe229e7fa86b99f2c5e2724f7f19040ac75a8ef7c8b23b434dac1eadea",
|
||||
"sha256:fd2d49a80a6c95be4eb0f8370a22eef903ecad10b65762d39c9b192abf905f7c"
|
||||
],
|
||||
"markers": "python_version >= '3.6'",
|
||||
"version": "==8.0.17"
|
||||
},
|
||||
"tqdm": {
|
||||
@@ -1202,7 +1117,6 @@
|
||||
"sha256:5646aef0d936b2c761a10393f0384ee6b5c7fe0bb3e5cd710b17134ca1d99cff",
|
||||
"sha256:e8467f0ebac0c81366c2168d6ad9f888efdfb6d4e1d3d5b4a004f46fa444b5c3"
|
||||
],
|
||||
"markers": "python_version >= '3.6'",
|
||||
"version": "==0.4.1"
|
||||
},
|
||||
"typing-extensions": {
|
||||
@@ -1210,7 +1124,6 @@
|
||||
"sha256:6657594ee297170d19f67d55c05852a874e7eb634f4f753dbd667855e07c1708",
|
||||
"sha256:f1c24655a0da0d1b67f07e17a5e6b2a105894e6824b92096378bb3668ef02376"
|
||||
],
|
||||
"markers": "python_version >= '3.7'",
|
||||
"version": "==4.2.0"
|
||||
},
|
||||
"tzdata": {
|
||||
@@ -1226,7 +1139,6 @@
|
||||
"sha256:89885494684c929d9191c57aa27502afc87a579be5cdd3225c77c463ea043745",
|
||||
"sha256:ee5842fa3a795f023514ac2d801c4a81d1743bbe642e3940143326b3a00addd7"
|
||||
],
|
||||
"markers": "python_version >= '3.6'",
|
||||
"version": "==4.2"
|
||||
},
|
||||
"urllib3": {
|
||||
@@ -1234,7 +1146,6 @@
|
||||
"sha256:44ece4d53fb1706f667c9bd1c648f5469a2ec925fcf3a776667042d645472c14",
|
||||
"sha256:aabaf16477806a5e1dd19aa41f8c2b7950dd3c746362d7e3223dbe6de6ac448e"
|
||||
],
|
||||
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' and python_version < '4'",
|
||||
"version": "==1.26.9"
|
||||
},
|
||||
"wasabi": {
|
||||
@@ -1295,7 +1206,6 @@
|
||||
"sha256:fab7c640815812ed5f10fbee7abbf58788d602046b7bb3af9b1ac753a6d5e916",
|
||||
"sha256:fc06cc8073c8e87072138ba1e431300e2d408f054b27047d047b549455066ff4"
|
||||
],
|
||||
"markers": "python_version >= '3.7'",
|
||||
"version": "==10.3"
|
||||
},
|
||||
"yt-dlp": {
|
||||
@@ -1320,7 +1230,6 @@
|
||||
"sha256:2d27e3784d7a565d36ab851fe94887c5eccd6a463168875832a1be79c82828b4",
|
||||
"sha256:626ba8234211db98e869df76230a137c4c40a12d72445c45d5f5b716f076e2fd"
|
||||
],
|
||||
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
|
||||
"version": "==21.4.0"
|
||||
},
|
||||
"babel": {
|
||||
@@ -1328,7 +1237,6 @@
|
||||
"sha256:3f349e85ad3154559ac4930c3918247d319f21910d5ce4b25d439ed8693b98d2",
|
||||
"sha256:98aeaca086133efb3e1e2aad0396987490c8425929ddbcfe0550184fdc54cd13"
|
||||
],
|
||||
"markers": "python_version >= '3.6'",
|
||||
"version": "==2.10.1"
|
||||
},
|
||||
"black": {
|
||||
@@ -1365,7 +1273,6 @@
|
||||
"sha256:9c5705e395cd70084351dd8ad5c41e65655e08ce46f2ec9cf6c2c08390f71eb7",
|
||||
"sha256:f1d53542ee8cbedbe2118b5686372fb33c297fcd6379b050cca0ef13a597382a"
|
||||
],
|
||||
"markers": "python_version >= '3.6'",
|
||||
"version": "==2022.5.18.1"
|
||||
},
|
||||
"charset-normalizer": {
|
||||
@@ -1373,7 +1280,6 @@
|
||||
"sha256:2857e29ff0d34db842cd7ca3230549d1a697f96ee6d3fb071cfa6c7393832597",
|
||||
"sha256:6881edbebdb17b39b4eaaa821b438bf6eddffb4468cf344f09f89def34a8b1df"
|
||||
],
|
||||
"markers": "python_version >= '3'",
|
||||
"version": "==2.0.12"
|
||||
},
|
||||
"click": {
|
||||
@@ -1381,13 +1287,9 @@
|
||||
"sha256:6a7a62563bbfabfda3a38f3023a1db4a35978c0abd76f6c9605ecd6554d6d9b1",
|
||||
"sha256:8458d7b1287c5fb128c90e23381cf99dcde74beaf6c7ff6384ce84d6fe090adb"
|
||||
],
|
||||
"markers": "python_version >= '3.6'",
|
||||
"version": "==8.0.4"
|
||||
},
|
||||
"coverage": {
|
||||
"extras": [
|
||||
"toml"
|
||||
],
|
||||
"hashes": [
|
||||
"sha256:01c5615d13f3dd3aa8543afc069e5319cfa0c7d712f6e04b920431e5c564a749",
|
||||
"sha256:106c16dfe494de3193ec55cac9640dd039b66e196e4641fa8ac396181578b982",
|
||||
@@ -1431,7 +1333,6 @@
|
||||
"sha256:fdb6f7bd51c2d1714cea40718f6149ad9be6a2ee7d93b19e9f00934c0f2a74d9",
|
||||
"sha256:ffa9297c3a453fba4717d06df579af42ab9a28022444cae7fa605af4df612d54"
|
||||
],
|
||||
"markers": "python_version >= '3.7'",
|
||||
"version": "==6.4.1"
|
||||
},
|
||||
"docutils": {
|
||||
@@ -1439,7 +1340,6 @@
|
||||
"sha256:686577d2e4c32380bb50cbb22f575ed742d58168cee37e99117a854bcd88f125",
|
||||
"sha256:cf316c8370a737a022b72b56874f6602acf974a37a9fba42ec2876387549fc61"
|
||||
],
|
||||
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
|
||||
"version": "==0.17.1"
|
||||
},
|
||||
"idna": {
|
||||
@@ -1447,7 +1347,6 @@
|
||||
"sha256:84d9dd047ffa80596e0f246e2eab0b391788b0503584e8945f2368256d2735ff",
|
||||
"sha256:9d643ff0a55b762d5cdb124b8eaa99c66322e2157b69160bc32796e824360e6d"
|
||||
],
|
||||
"markers": "python_version >= '3'",
|
||||
"version": "==3.3"
|
||||
},
|
||||
"imagesize": {
|
||||
@@ -1455,7 +1354,6 @@
|
||||
"sha256:1db2f82529e53c3e929e8926a1fa9235aa82d0bd0c580359c67ec31b2fddaa8c",
|
||||
"sha256:cd1750d452385ca327479d45b64d9c7729ecf0b3969a58148298c77092261f9d"
|
||||
],
|
||||
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
|
||||
"version": "==1.3.0"
|
||||
},
|
||||
"importlib-metadata": {
|
||||
@@ -1478,7 +1376,6 @@
|
||||
"sha256:31351a702a408a9e7595a8fc6150fc3f43bb6bf7e319770cbc0db9df9437e852",
|
||||
"sha256:6088930bfe239f0e6710546ab9c19c9ef35e29792895fed6e6e31a023a182a61"
|
||||
],
|
||||
"markers": "python_version >= '3.7'",
|
||||
"version": "==3.1.2"
|
||||
},
|
||||
"markupsafe": {
|
||||
@@ -1524,7 +1421,6 @@
|
||||
"sha256:f121a1420d4e173a5d96e47e9a0c0dcff965afdf1626d28de1460815f7c4ee7a",
|
||||
"sha256:fc7b548b17d238737688817ab67deebb30e8073c95749d55538ed473130ec0c7"
|
||||
],
|
||||
"markers": "python_version >= '3.7'",
|
||||
"version": "==2.1.1"
|
||||
},
|
||||
"mypy-extensions": {
|
||||
@@ -1539,7 +1435,6 @@
|
||||
"sha256:dd47c42927d89ab911e606518907cc2d3a1f38bbd026385970643f9c5b8ecfeb",
|
||||
"sha256:ef103e05f519cdc783ae24ea4e2e0f508a9c99b2d4969652eed6a2e1ea5bd522"
|
||||
],
|
||||
"markers": "python_version >= '3.6'",
|
||||
"version": "==21.3"
|
||||
},
|
||||
"pathspec": {
|
||||
@@ -1554,7 +1449,6 @@
|
||||
"sha256:027d8e83a2d7de06bbac4e5ef7e023c02b863d7ea5d079477e722bb41ab25788",
|
||||
"sha256:58c8abb07dcb441e6ee4b11d8df0ac856038f944ab98b7be6b27b2a3c7feef19"
|
||||
],
|
||||
"markers": "python_version >= '3.7'",
|
||||
"version": "==2.5.2"
|
||||
},
|
||||
"pluggy": {
|
||||
@@ -1562,7 +1456,6 @@
|
||||
"sha256:4224373bacce55f955a878bf9cfa763c1e360858e330072059e10bad68531159",
|
||||
"sha256:74134bbf457f031a36d68416e1509f34bd5ccc019f0bcc952c7b909d06b37bd3"
|
||||
],
|
||||
"markers": "python_version >= '3.6'",
|
||||
"version": "==1.0.0"
|
||||
},
|
||||
"py": {
|
||||
@@ -1570,7 +1463,6 @@
|
||||
"sha256:51c75c4126074b472f746a24399ad32f6053d1b34b68d2fa41e558e6f4a98719",
|
||||
"sha256:607c53218732647dff4acdfcd50cb62615cedf612e72d1724fb1a0cc6405b378"
|
||||
],
|
||||
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
|
||||
"version": "==1.11.0"
|
||||
},
|
||||
"pygments": {
|
||||
@@ -1578,7 +1470,6 @@
|
||||
"sha256:5eb116118f9612ff1ee89ac96437bb6b49e8f04d8a13b514ba26f620208e26eb",
|
||||
"sha256:dc9c10fb40944260f6ed4c688ece0cd2048414940f1cea51b8b226318411c519"
|
||||
],
|
||||
"markers": "python_version >= '3.6'",
|
||||
"version": "==2.12.0"
|
||||
},
|
||||
"pyparsing": {
|
||||
@@ -1586,7 +1477,6 @@
|
||||
"sha256:2b020ecf7d21b687f219b71ecad3631f644a47f01403fa1d1036b0c6416d70fb",
|
||||
"sha256:5026bae9a10eeaefb61dab2f09052b9f4307d44aee4eda64b309723d8d206bbc"
|
||||
],
|
||||
"markers": "python_full_version >= '3.6.8'",
|
||||
"version": "==3.0.9"
|
||||
},
|
||||
"pytest": {
|
||||
@@ -1631,11 +1521,11 @@
|
||||
},
|
||||
"requests": {
|
||||
"hashes": [
|
||||
"sha256:68d7c56fd5a8999887728ef304a6d12edc7be74f1cfa47714fc8b414525c9a61",
|
||||
"sha256:f22fa1e554c9ddfd16e6e41ac79759e17be9e492b3587efa038054674760e72d"
|
||||
"sha256:bc7861137fbce630f17b03d3ad02ad0bf978c844f3536d0edda6499dafce2b6f",
|
||||
"sha256:d568723a7ebd25875d8d1eaf5dfa068cd2fc8194b2e483d7b1f7c81918dbec6b"
|
||||
],
|
||||
"index": "pypi",
|
||||
"version": "==2.27.1"
|
||||
"version": "==2.28.0"
|
||||
},
|
||||
"snowballstemmer": {
|
||||
"hashes": [
|
||||
@@ -1665,7 +1555,6 @@
|
||||
"sha256:806111e5e962be97c29ec4c1e7fe277bfd19e9652fb1a4392105b43e01af885a",
|
||||
"sha256:a072735ec80e7675e3f432fcae8610ecf509c5f1869d17e2eecff44389cdbc58"
|
||||
],
|
||||
"markers": "python_version >= '3.5'",
|
||||
"version": "==1.0.2"
|
||||
},
|
||||
"sphinxcontrib-devhelp": {
|
||||
@@ -1673,7 +1562,6 @@
|
||||
"sha256:8165223f9a335cc1af7ffe1ed31d2871f325254c0423bc0c4c7cd1c1e4734a2e",
|
||||
"sha256:ff7f1afa7b9642e7060379360a67e9c41e8f3121f2ce9164266f61b9f4b338e4"
|
||||
],
|
||||
"markers": "python_version >= '3.5'",
|
||||
"version": "==1.0.2"
|
||||
},
|
||||
"sphinxcontrib-htmlhelp": {
|
||||
@@ -1681,7 +1569,6 @@
|
||||
"sha256:d412243dfb797ae3ec2b59eca0e52dac12e75a241bf0e4eb861e450d06c6ed07",
|
||||
"sha256:f5f8bb2d0d629f398bf47d0d69c07bc13b65f75a81ad9e2f71a63d4b7a2f6db2"
|
||||
],
|
||||
"markers": "python_version >= '3.6'",
|
||||
"version": "==2.0.0"
|
||||
},
|
||||
"sphinxcontrib-jsmath": {
|
||||
@@ -1689,7 +1576,6 @@
|
||||
"sha256:2ec2eaebfb78f3f2078e73666b1415417a116cc848b72e5172e596c871103178",
|
||||
"sha256:a9925e4a4587247ed2191a22df5f6970656cb8ca2bd6284309578f2153e0c4b8"
|
||||
],
|
||||
"markers": "python_version >= '3.5'",
|
||||
"version": "==1.0.1"
|
||||
},
|
||||
"sphinxcontrib-qthelp": {
|
||||
@@ -1697,7 +1583,6 @@
|
||||
"sha256:4c33767ee058b70dba89a6fc5c1892c0d57a54be67ddd3e7875a18d14cba5a72",
|
||||
"sha256:bd9fc24bcb748a8d51fd4ecaade681350aa63009a347a8c14e637895444dfab6"
|
||||
],
|
||||
"markers": "python_version >= '3.5'",
|
||||
"version": "==1.0.3"
|
||||
},
|
||||
"sphinxcontrib-serializinghtml": {
|
||||
@@ -1705,7 +1590,6 @@
|
||||
"sha256:352a9a00ae864471d3a7ead8d7d79f5fc0b57e8b3f95e9867eb9eb28999b92fd",
|
||||
"sha256:aa5f6de5dfdf809ef505c4895e51ef5c9eac17d0f287933eb49ec495280b6952"
|
||||
],
|
||||
"markers": "python_version >= '3.5'",
|
||||
"version": "==1.1.5"
|
||||
},
|
||||
"tomli": {
|
||||
@@ -1713,7 +1597,6 @@
|
||||
"sha256:939de3e7a6161af0c887ef91b7d41a53e7c5a1ca976325f429cb46ea9bc30ecc",
|
||||
"sha256:de526c12914f0c550d15924c62d72abc48d6fe7364aa87328337a31007fe8a4f"
|
||||
],
|
||||
"markers": "python_version >= '3.7'",
|
||||
"version": "==2.0.1"
|
||||
},
|
||||
"typing-extensions": {
|
||||
@@ -1721,7 +1604,6 @@
|
||||
"sha256:6657594ee297170d19f67d55c05852a874e7eb634f4f753dbd667855e07c1708",
|
||||
"sha256:f1c24655a0da0d1b67f07e17a5e6b2a105894e6824b92096378bb3668ef02376"
|
||||
],
|
||||
"markers": "python_version >= '3.7'",
|
||||
"version": "==4.2.0"
|
||||
},
|
||||
"urllib3": {
|
||||
@@ -1729,7 +1611,6 @@
|
||||
"sha256:44ece4d53fb1706f667c9bd1c648f5469a2ec925fcf3a776667042d645472c14",
|
||||
"sha256:aabaf16477806a5e1dd19aa41f8c2b7950dd3c746362d7e3223dbe6de6ac448e"
|
||||
],
|
||||
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' and python_version < '4'",
|
||||
"version": "==1.26.9"
|
||||
},
|
||||
"zipp": {
|
||||
@@ -1737,7 +1618,6 @@
|
||||
"sha256:56bf8aadb83c24db6c4b577e13de374ccfb67da2078beba1d037c17980bf43ad",
|
||||
"sha256:c4f6e5bbf48e74f7a38e7cc5b0480ff42b0ae5178957d564d18932525d5cf099"
|
||||
],
|
||||
"markers": "python_version >= '3.7'",
|
||||
"version": "==3.8.0"
|
||||
}
|
||||
}
|
||||
|
||||
@@ -237,12 +237,12 @@ class Post:
|
||||
|
||||
# replace is here in order to prevent catastrophic backtracking
|
||||
urls = re.findall(URL_REGEX, self.content.replace("::::::::", ""))
|
||||
self.outlinks = urls
|
||||
self.outlinks += urls
|
||||
|
||||
HASHTAG_REGEX = r"(?:^|\s)[##]{1}(\w+)"
|
||||
|
||||
hashtags = re.findall(HASHTAG_REGEX, self.content)
|
||||
self.hashtags = hashtags
|
||||
self.hashtags += hashtags
|
||||
|
||||
# regex patterns for finding crypto addresses
|
||||
BTC_REGEX = r'\b(bc(0([ac-hj-np-z02-9]{39}|[ac-hj-np-z02-9]{59})|1[ac-hj-np-z02-9]{8,87})|[13][a-km-zA-HJ-NP-Z1-9]{25,35})\b'
|
||||
|
||||
@@ -421,6 +421,9 @@ class ScraperController:
|
||||
session.commit()
|
||||
added += 1
|
||||
|
||||
profile = scraper.get_profile(channel)
|
||||
session.add(profile)
|
||||
|
||||
session.commit()
|
||||
logger.info(
|
||||
f"{scraper} found {added} new posts from {channel}")
|
||||
|
||||
@@ -5,6 +5,7 @@ from html.parser import HTMLParser
|
||||
import dateparser
|
||||
import json
|
||||
from typing import Generator
|
||||
from dateutil.relativedelta import relativedelta
|
||||
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
@@ -70,7 +71,7 @@ class BitchuteScraper(Scraper):
|
||||
if channel.platform == "Bitchute" and self.get_username_from_url(channel.url) is not None:
|
||||
return True
|
||||
|
||||
@logger.catch
|
||||
@logger.catch(reraise = True)
|
||||
def get_profile(self, channel: Channel) -> RawChannelInfo:
|
||||
|
||||
base_url = channel.url
|
||||
@@ -104,7 +105,7 @@ class BitchuteScraper(Scraper):
|
||||
profile = {
|
||||
'description' : description_soup.text.strip(),
|
||||
'description_links' : [a['href'] for a in description_soup.find_all('a', href = True)],
|
||||
'created': re.sub(r'\s', ' ', info_list[0].text.split('Created')[1].strip('. ')),
|
||||
'created': parse_created(re.sub(r'\s', ' ', info_list[0].text.split('Created')[1].strip('. '))),
|
||||
'videos' : int(info_list[1].text.split('videos')[0].strip()),
|
||||
'owner_url' : soup.find('p', {'class' : 'owner'}).find('a', href = True)['href'],
|
||||
'owner_name' : owner_name,
|
||||
@@ -116,7 +117,7 @@ class BitchuteScraper(Scraper):
|
||||
return RawChannelInfo(scraper=self.__version__,
|
||||
platform=channel.platform,
|
||||
channel=channel.id,
|
||||
raw_data=json.dumps(profile),
|
||||
raw_data=json.dumps(profile, default = str),
|
||||
date_archived=datetime.now(timezone.utc))
|
||||
|
||||
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
|
||||
@@ -484,4 +485,14 @@ def decode_cfemail(cfemail):
|
||||
|
||||
return email
|
||||
|
||||
#---------------------------------------------------------------------------#
|
||||
#---------------------------------------------------------------------------#
|
||||
|
||||
def parse_created(created):
    """Convert a relative "created" string into an approximate datetime.

    The input is expected to look like ``"1 year, 2 months ago"``: a
    comma-separated list of ``"<number> <unit>"`` pairs followed by the
    word ``ago``. Each pair is turned into a ``relativedelta`` offset that
    is subtracted from the current time.

    Parameters
    ----------
    created : str
        Relative age string as scraped from the channel page.

    Returns
    -------
    datetime
        Timezone-aware (UTC) datetime that lies the given offset before now.

    Raises
    ------
    ValueError
        If a pair is not exactly ``"<number> <unit>"`` or the number is not
        an integer.
    TypeError
        If a unit is not a keyword accepted by ``relativedelta``.
    """

    # relativedelta interprets singular keywords (``year=``, ``hour=``, ...)
    # as absolute replacements and plural ones as relative offsets, so every
    # singular unit must be pluralised before it is passed on.
    singular_units = ('year', 'month', 'week', 'day', 'hour', 'minute', 'second')

    kwargs = {}
    for part in created.split('ago')[0].split(','):
        # split() without an argument tolerates repeated/odd whitespace
        tokens = part.split()
        if not tokens:
            # empty fragment (e.g. stray comma); nothing to add
            continue

        number, unit = tokens
        if unit in singular_units:
            unit += 's'
        kwargs[unit] = int(number)

    # Use an aware UTC "now", matching the other timestamps in this module.
    return datetime.now(timezone.utc) - relativedelta(**kwargs)
|
||||
@@ -14,7 +14,7 @@ BASE_URL = 'https://rumble.com'
|
||||
|
||||
class RumbleScraper(Scraper):
|
||||
"""An implementation of a Scraper for Rumble, using custom functions"""
|
||||
__version__ = "RumbleScraper 0.0.1"
|
||||
__version__ = "RumbleScraper 0.0.2"
|
||||
|
||||
cookiestring = os.environ["YOUTUBE_COOKIESTRING"].replace(r'\n', '\n').replace(r'\t', '\t')
|
||||
cookiefilename = 'cookiefile.txt'
|
||||
@@ -105,20 +105,38 @@ def process_video(video):
|
||||
views = None
|
||||
else:
|
||||
views = view_span.get('data-value')
|
||||
|
||||
|
||||
author_a = video.find('a', {'rel': 'author'})
|
||||
if author_a is None:
|
||||
author_id = None
|
||||
author_name = None
|
||||
else:
|
||||
author_id = author_a['href'].split('/')[-1]
|
||||
author_name = author_a.text
|
||||
|
||||
video_link = BASE_URL + video.find('a', href = True)['href']
|
||||
r = make_request(url = video_link)
|
||||
soup = BeautifulSoup(r.content, features = 'html.parser')
|
||||
|
||||
content_div = soup.find('div', {'class': 'container content media-description'})
|
||||
|
||||
info = {
|
||||
'title' : video.find('h3').text,
|
||||
'thumbnail' : video.find('img')['src'],
|
||||
'link' : BASE_URL + video.find('a', href = True)['href'],
|
||||
'link' : video_link,
|
||||
'views' : views,
|
||||
'rumbles' : rumbles,
|
||||
'content': '' if content_div is None else content_div.get_text('\n'),
|
||||
'duration' : video.find('span', {'class' : 'video-item--duration'})['data-value'],
|
||||
'datetime' : datetime.fromisoformat(video.find('time')['datetime'])}
|
||||
'datetime' : datetime.fromisoformat(video.find('time')['datetime']),
|
||||
'author_id': author_id,
|
||||
'author_name': author_name}
|
||||
|
||||
info['media_url'] = get_media_url(info['link'])
|
||||
|
||||
return info
|
||||
|
||||
|
||||
def get_channel_videos(url):
|
||||
|
||||
page = 1
|
||||
@@ -150,8 +168,15 @@ def get_channel_profile(url):
|
||||
thumbnail_soup = soup.find('img', {'class' : 'listing-header--thumb'})
|
||||
cover_soup = soup.find('img', {'class' : 'listing-header--backsplash-img'})
|
||||
|
||||
author_a = soup.find('a', {'rel': 'author'})
|
||||
if author_a is None:
|
||||
author_id = None
|
||||
else:
|
||||
author_id = author_a['href'].split('/')[-1]
|
||||
|
||||
profile = {
|
||||
'name': soup.find('h1').text,
|
||||
'id': author_id,
|
||||
'verified': verified_svg is not None,
|
||||
'thumbnail': thumbnail_soup.get('src') if thumbnail_soup else None,
|
||||
'cover': cover_soup.get('src') if cover_soup else None,
|
||||
|
||||
@@ -1,4 +1,6 @@
|
||||
from .base import ETLController
|
||||
from .twitter import TwitterTransformer
|
||||
from .bitchute import BitchuteTransformer
|
||||
from .telegram_telethon import TelegramTelethonTransformer
|
||||
from .telegram_telethon import TelegramTelethonTransformer
|
||||
from .rumble import RumbleTransformer
|
||||
from .gettr import GettrTransformer
|
||||
|
||||
@@ -1,11 +1,13 @@
|
||||
import json
|
||||
from loguru import logger
|
||||
from typing import Generator
|
||||
from typing import Generator, Union, Callable
|
||||
from datetime import datetime, timezone
|
||||
import dateutil.parser
|
||||
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
from cisticola.transformer.base import Transformer
|
||||
from cisticola.base import ScraperResult, Post, Image, Video, Media
|
||||
from cisticola.base import RawChannelInfo, ScraperResult, Post, Image, Video, Media, Channel, ChannelInfo
|
||||
|
||||
class BitchuteTransformer(Transformer):
|
||||
"""A Bitchute specific ScraperResult, with a method ETL/transforming"""
|
||||
@@ -19,7 +21,7 @@ class BitchuteTransformer(Transformer):
|
||||
|
||||
return False
|
||||
|
||||
def transform_media(self, data: ScraperResult, transformed: Post) -> Generator[Media, None, None]:
|
||||
def transform_media(self, data: ScraperResult, insert: Callable, transformed: Post) -> Generator[Media, None, None]:
|
||||
raw = json.loads(data.raw_data)
|
||||
|
||||
orig = raw['video_url']
|
||||
@@ -27,9 +29,34 @@ class BitchuteTransformer(Transformer):
|
||||
|
||||
m = Video(url=new, post=transformed.id, raw_id=data.id, original_url=orig)
|
||||
|
||||
yield m
|
||||
insert(m)
|
||||
|
||||
def transform(self, data: ScraperResult) -> Post:
|
||||
def transform_info(self, data: RawChannelInfo, insert: Callable, session) -> Generator[Union[Post, Channel, Media], None, None]:
    """Build a ``ChannelInfo`` from a raw Bitchute profile and pass it to ``insert``.

    ``data.raw_data`` is decoded as JSON; the keys read from it are
    ``owner_url``, ``owner_name``, ``description``, ``subscribers`` and
    ``created``. Nothing is yielded or returned: the resulting object is
    handed to the ``insert`` callback. ``session`` is not used here.
    """
    profile = json.loads(data.raw_data)

    fields = dict(
        raw_channel_info_id=data.id,
        channel=data.channel,
        # last path segment of the owner URL serves as the platform id
        platform_id=profile['owner_url'].strip('/').split('/')[-1],
        platform=data.platform,
        scraper=data.scraper,
        transformer=self.__version__,
        screenname=profile['owner_name'],
        name=profile['owner_name'],
        description=profile['description'],
        description_url='',  # does not exist for Bitchute
        description_location='',  # does not exist for Bitchute
        followers=profile['subscribers'],
        following=-1,  # does not exist for Bitchute
        verified=False,  # does not exist for Bitchute
        date_created=dateutil.parser.parse(profile['created']),
        date_archived=data.date_archived,
        date_transformed=datetime.now(timezone.utc),
    )

    insert(ChannelInfo(**fields))
|
||||
|
||||
def transform(self, data: ScraperResult, insert: Callable, session) -> Generator[Union[Post, Channel, Media], None, None]:
|
||||
raw = json.loads(data.raw_data)
|
||||
|
||||
soup = BeautifulSoup(raw['body'], features = 'html.parser')
|
||||
@@ -37,15 +64,17 @@ class BitchuteTransformer(Transformer):
|
||||
|
||||
transformed = Post(
|
||||
raw_id=data.id,
|
||||
platform_id=raw['id'],
|
||||
scraper=data.scraper,
|
||||
transformer=self.__version__,
|
||||
platform=data.platform,
|
||||
channel=data.channel,
|
||||
date=data.date,
|
||||
date_archived=data.date_archived,
|
||||
date_transformed=datetime.now(timezone.utc),
|
||||
url=raw['url'],
|
||||
content=content,
|
||||
author_id=raw['author_id'],
|
||||
author_username=raw['author'])
|
||||
|
||||
return transformed
|
||||
transformed = insert(transformed)
|
||||
78
cisticola/transformer/gettr.py
Normal file
78
cisticola/transformer/gettr.py
Normal file
@@ -0,0 +1,78 @@
|
||||
import json
|
||||
from loguru import logger
|
||||
from typing import Generator, Union, Callable
|
||||
import dateutil.parser
|
||||
from datetime import datetime, timezone
|
||||
|
||||
from cisticola.transformer.base import Transformer
|
||||
from cisticola.base import RawChannelInfo, ChannelInfo, ScraperResult, Post, Image, Video, Media, Channel
|
||||
|
||||
class GettrTransformer(Transformer):
    """Transform raw ``GettrScraper`` output into ``ChannelInfo`` and ``Post`` objects.

    Neither transform method returns or yields its result; every object
    built here is handed to the ``insert`` callback supplied by the caller.
    """

    __version__ = "GettrTransformer 0.0.1"

    def can_handle(self, data: ScraperResult) -> bool:
        """Return True when ``data`` was produced by a ``GettrScraper``."""
        # Only the first space-separated token of ``data.scraper`` is compared,
        # so any suffix after the scraper name is ignored.
        scraper = data.scraper.split(' ')
        if scraper[0] == "GettrScraper":
            return True

        return False

    def transform_info(self, data: RawChannelInfo, insert: Callable, session) -> Generator[Union[Post, Channel, Media], None, None]:
        """Build a ``ChannelInfo`` from a raw Gettr profile and pass it to ``insert``.

        ``data.raw_data`` is decoded as JSON. ``cdate`` is treated as an epoch
        timestamp in milliseconds and converted to a timezone-aware UTC
        datetime. ``session`` is not used here.

        Raises
        ------
        KeyError
            If one of the directly indexed profile keys is missing.
        """
        raw = json.loads(data.raw_data)

        transformed = ChannelInfo(
            raw_channel_info_id=data.id,
            channel=data.channel,
            platform_id=raw['_id'],
            platform=data.platform,
            scraper=data.scraper,
            transformer=self.__version__,
            screenname=raw['username'],
            name=raw['nickname'],
            description=raw['dsc'],
            description_url=raw['website'],
            description_location=raw['location'],
            followers=raw['flg'],
            following=raw['flw'],
            verified=True if raw.get('infl') else False,
            # tz=timezone.utc keeps the value independent of the server's
            # local timezone and consistent with date_transformed below
            date_created=datetime.fromtimestamp(raw['cdate'] / 1000.0, tz=timezone.utc),
            date_archived=data.date_archived,
            date_transformed=datetime.now(timezone.utc)
        )

        transformed = insert(transformed)

    def transform(self, data: ScraperResult, insert: Callable, session) -> Generator[Union[Post, Channel, Media], None, None]:
        """Build a ``Post`` from a raw Gettr activity and pass it to ``insert``.

        When the activity's action is ``shares_pst`` the activity ``uid`` is
        recorded as ``forwarded_from``; otherwise that field is ``None``.
        ``activity.cdate`` is treated as an epoch timestamp in milliseconds
        and converted to a timezone-aware UTC datetime. ``session`` is not
        used here.
        """
        raw = json.loads(data.raw_data)

        if raw["activity"]["action"] == "shares_pst":
            forwarded_from = raw["activity"]["uid"]
        else:
            forwarded_from = None

        transformed = Post(
            raw_id=data.id,
            platform_id=raw["_id"],
            scraper=data.scraper,
            transformer=self.__version__,
            platform=data.platform,
            channel=data.channel,
            date=datetime.fromtimestamp(raw["activity"]["cdate"] / 1000.0, tz=timezone.utc),
            date_archived=data.date_archived,
            date_transformed=datetime.now(timezone.utc),
            url="https://www.gettr.com/post/" + raw["_id"],
            content=raw.get("txt", ""),
            author_id=raw["receiver_id"],
            author_username=raw["uid"],
            hashtags=raw.get("htgs", []),
            # "prevsrc" becomes the single outlink when present and non-empty
            outlinks=list(filter(None, [raw.get("prevsrc")])),
            forwarded_from=forwarded_from)

        insert(transformed)

        # NOTE: media processing is currently disabled for Gettr posts.
        # media = self.process_media(raw, transformed.id, data)
        # for m in media:
        #     insert(m)
|
||||
70
cisticola/transformer/rumble.py
Normal file
70
cisticola/transformer/rumble.py
Normal file
@@ -0,0 +1,70 @@
|
||||
import json
|
||||
from loguru import logger
|
||||
from typing import Generator, Union, Callable
|
||||
import dateutil.parser
|
||||
from datetime import datetime, timezone
|
||||
|
||||
from cisticola.transformer.base import Transformer
|
||||
from cisticola.base import RawChannelInfo, ChannelInfo, ScraperResult, Post, Image, Video, Media, Channel
|
||||
|
||||
class RumbleTransformer(Transformer):
    """Transform raw ``RumbleScraper`` output into ``ChannelInfo`` and ``Post`` objects.

    Results are not returned or yielded; each object is passed to the
    ``insert`` callback supplied by the caller.
    """

    __version__ = "RumbleTransformer 0.0.1"

    def can_handle(self, data: ScraperResult) -> bool:
        """Return True when ``data`` was produced by a ``RumbleScraper``."""
        # compare only the first space-separated token of the scraper string
        scraper_name = data.scraper.split(' ')[0]
        return scraper_name == "RumbleScraper"

    def transform_info(self, data: RawChannelInfo, insert: Callable, session) -> Generator[Union[Post, Channel, Media], None, None]:
        """Build a ``ChannelInfo`` from a raw Rumble profile and pass it to ``insert``.

        Keys read from the decoded ``data.raw_data``: ``id``, ``name``,
        ``subscribers`` and ``verified``. ``session`` is not used here.
        """
        profile = json.loads(data.raw_data)

        fields = dict(
            raw_channel_info_id=data.id,
            channel=data.channel,
            platform_id=profile['id'],
            platform=data.platform,
            scraper=data.scraper,
            transformer=self.__version__,
            screenname=profile['id'],
            name=profile['name'],
            description='',  # does not exist for Rumble
            description_url='',  # does not exist for Rumble
            description_location='',  # does not exist for Rumble
            followers=profile['subscribers'],
            following=-1,  # does not exist for Rumble
            verified=profile['verified'],
            date_created=None,  # does not exist for Rumble
            date_archived=data.date_archived,
            date_transformed=datetime.now(timezone.utc),
        )

        insert(ChannelInfo(**fields))

    def transform(self, data: ScraperResult, insert: Callable, session) -> Generator[Union[Post, Channel, Media], None, None]:
        """Build a ``Post`` from a raw Rumble video record and pass it to ``insert``.

        Keys read from the decoded ``data.raw_data``: ``media_url``,
        ``datetime``, ``link``, ``content``, ``author_id`` and
        ``author_name``. ``session`` is not used here.
        """
        video = json.loads(data.raw_data)

        # the last path segment of the media URL serves as the platform id
        media_id = video['media_url'].strip('/').split('/')[-1]
        published = dateutil.parser.parse(video['datetime'])

        post = Post(
            raw_id=data.id,
            platform_id=media_id,
            scraper=data.scraper,
            transformer=self.__version__,
            platform=data.platform,
            channel=data.channel,
            date=published,
            date_archived=data.date_archived,
            date_transformed=datetime.now(timezone.utc),
            url=video['link'],
            content=video['content'],
            author_id=video['author_id'],
            author_username=video['author_name'],
        )

        insert(post)

        # NOTE: media processing is currently disabled for Rumble posts.
        # media = self.process_media(raw, transformed.id, data)
        # for m in media:
        #     insert(m)
|
||||
@@ -61,7 +61,7 @@ class TelegramTelethonTransformer(Transformer):
|
||||
self.bad_channels[orig_screenname] = True
|
||||
return ""
|
||||
|
||||
soup = BeautifulSoup(r.content)
|
||||
soup = BeautifulSoup(r.content, features = 'lxml')
|
||||
post = soup.findAll("div", {"data-post" : orig_screenname + "/" + str(id)})
|
||||
name = ""
|
||||
|
||||
@@ -181,14 +181,14 @@ class TelegramTelethonTransformer(Transformer):
|
||||
|
||||
transformed = insert(transformed)
|
||||
|
||||
# for k in data.archived_urls:
|
||||
# if data.archived_urls[k]:
|
||||
# archived_url = data.archived_urls[k]
|
||||
# ext = archived_url.split('.')[-1]
|
||||
for k in data.archived_urls:
|
||||
if data.archived_urls[k]:
|
||||
archived_url = data.archived_urls[k]
|
||||
ext = archived_url.split('.')[-1]
|
||||
|
||||
# if ext == 'mp4' or ext == 'mov' or ext == 'avi' or ext =='mkv':
|
||||
# insert(Video(url=archived_url, post=transformed.id, raw_id=data.id, original_url=k))
|
||||
# else:
|
||||
# insert(Image(url=archived_url, post=transformed.id, raw_id=data.id, original_url=k))
|
||||
if ext == 'mp4' or ext == 'mov' or ext == 'avi' or ext =='mkv':
|
||||
insert(Video(url=archived_url, post=transformed.id, raw_id=data.id, original_url=k))
|
||||
else:
|
||||
insert(Image(url=archived_url, post=transformed.id, raw_id=data.id, original_url=k))
|
||||
|
||||
|
||||
@@ -2,9 +2,10 @@ import json
|
||||
from loguru import logger
|
||||
from typing import Generator, Union, Callable
|
||||
import dateutil.parser
|
||||
from datetime import datetime, timezone
|
||||
|
||||
from cisticola.transformer.base import Transformer
|
||||
from cisticola.base import ScraperResult, Post, Image, Video, Media, Channel
|
||||
from cisticola.base import RawChannelInfo, ChannelInfo, ScraperResult, Post, Image, Video, Media, Channel
|
||||
|
||||
class TwitterTransformer(Transformer):
|
||||
"""A Twitter specific ScraperResult, with a method ETL/transforming"""
|
||||
@@ -45,8 +46,33 @@ class TwitterTransformer(Transformer):
|
||||
|
||||
yield m
|
||||
|
||||
def transform_info(self, data: RawChannelInfo, insert: Callable, session) -> Generator[Union[Post, Channel, Media], None, None]:
|
||||
raw = json.loads(data.raw_data)
|
||||
|
||||
def transform(self, data: ScraperResult, insert: Callable) -> Generator[Union[Post, Channel, Media], None, None]:
|
||||
transformed = ChannelInfo(
|
||||
raw_channel_info_id=data.id,
|
||||
channel=data.channel,
|
||||
platform_id=raw['id'],
|
||||
platform=data.platform,
|
||||
scraper=data.scraper,
|
||||
transformer=self.__version__,
|
||||
screenname=raw['username'],
|
||||
name=raw['displayname'],
|
||||
description=raw['rawDescription'],
|
||||
description_url=raw['linkUrl'],
|
||||
description_location=raw['location'],
|
||||
followers=raw['followersCount'],
|
||||
following=raw['friendsCount'],
|
||||
verified=raw['verified'],
|
||||
date_created=dateutil.parser.parse(raw['created']),
|
||||
date_archived=data.date_archived,
|
||||
date_transformed=datetime.now(timezone.utc)
|
||||
)
|
||||
|
||||
transformed = insert(transformed)
|
||||
|
||||
|
||||
def transform(self, data: ScraperResult, insert: Callable, session) -> Generator[Union[Post, Channel, Media], None, None]:
|
||||
raw = json.loads(data.raw_data)
|
||||
|
||||
transformed = Post(
|
||||
@@ -58,6 +84,7 @@ class TwitterTransformer(Transformer):
|
||||
channel=data.channel,
|
||||
date=dateutil.parser.parse(raw['date']),
|
||||
date_archived=data.date_archived,
|
||||
date_transformed=datetime.now(timezone.utc),
|
||||
url=raw['url'],
|
||||
content=raw['content'],
|
||||
author_id=raw['user']['id'],
|
||||
@@ -85,6 +112,7 @@ class TwitterTransformer(Transformer):
|
||||
channel=channel.id,
|
||||
date=dateutil.parser.parse(tweet['date']),
|
||||
date_archived=data.date_archived,
|
||||
date_transformed=datetime.now(timezone.utc),
|
||||
url=tweet['url'],
|
||||
content=tweet['content'],
|
||||
author_id=tweet['user']['id'],
|
||||
@@ -109,7 +137,4 @@ class TwitterTransformer(Transformer):
|
||||
|
||||
media = self.process_media(raw, transformed.id, data)
|
||||
for m in media:
|
||||
insert(m)
|
||||
|
||||
|
||||
|
||||
insert(m)
|
||||
34
tests/transformer/bitchute.py
Normal file
34
tests/transformer/bitchute.py
Normal file
@@ -0,0 +1,34 @@
|
||||
from sqlalchemy.orm import sessionmaker
|
||||
import json
|
||||
|
||||
import pytest
|
||||
|
||||
from cisticola.base import Channel
|
||||
from cisticola.scraper import BitchuteScraper
|
||||
from cisticola.transformer import BitchuteTransformer
|
||||
from cisticola.base import Post, Media
|
||||
|
||||
@pytest.mark.media
def test_scrape_etl_bitchute(engine, controller, etl_controller, channel_kwargs):
    """Scrape the Bitchute fixture channel, run the ETL and check the stored posts."""
    controller.reset_db()

    # scrape stage
    bitchute_channels = [Channel(**channel_kwargs['bitchute'])]
    controller.register_scraper(scraper=BitchuteScraper())
    controller.scrape_channels(channels=bitchute_channels, archive_media=True)

    # transform stage: posts first, then channel info
    etl_controller.register_transformer(BitchuteTransformer())
    etl_controller.transform_all_untransformed()
    etl_controller.transform_all_untransformed_info()

    # read the results back through a fresh session bound to the test engine
    make_session = sessionmaker(bind=engine)
    session = make_session()

    posts = session.query(Post).all()
    media = session.query(Media).all()

    assert len(posts) == 5
    # assert len(media) == 0

    expected_snippet = 'Pendant are something that the advanced ladies can fuse in her every day look'
    assert expected_snippet in posts[0].content
    # assert json.loads(media[0].exif)['Composite:ImageSize'] == "826 728"
|
||||
34
tests/transformer/gettr.py
Normal file
34
tests/transformer/gettr.py
Normal file
@@ -0,0 +1,34 @@
|
||||
from sqlalchemy.orm import sessionmaker
|
||||
import json
|
||||
|
||||
import pytest
|
||||
|
||||
from cisticola.base import Channel
|
||||
from cisticola.scraper import GettrScraper
|
||||
from cisticola.transformer import GettrTransformer
|
||||
from cisticola.base import Post, Media
|
||||
|
||||
@pytest.mark.media
def test_scrape_etl_gettr(engine, controller, etl_controller, channel_kwargs):
    """Scrape the Gettr fixture channel, run the ETL and check the stored posts."""
    controller.reset_db()

    # scrape stage
    gettr_channels = [Channel(**channel_kwargs['gettr'])]
    controller.register_scraper(scraper=GettrScraper())
    controller.scrape_channels(channels=gettr_channels, archive_media=True)

    # transform stage: posts first, then channel info
    etl_controller.register_transformer(GettrTransformer())
    etl_controller.transform_all_untransformed()
    etl_controller.transform_all_untransformed_info()

    # read the results back through a fresh session bound to the test engine
    make_session = sessionmaker(bind=engine)
    session = make_session()

    posts = session.query(Post).all()
    media = session.query(Media).all()

    assert len(posts) == 23
    # assert len(media) == 0

    expected_snippet = 'Nigerian gender studies'
    assert expected_snippet in posts[-1].content
    # assert json.loads(media[0].exif)['Composite:ImageSize'] == "826 728"
|
||||
34
tests/transformer/rumble.py
Normal file
34
tests/transformer/rumble.py
Normal file
@@ -0,0 +1,34 @@
|
||||
from sqlalchemy.orm import sessionmaker
|
||||
import json
|
||||
|
||||
import pytest
|
||||
|
||||
from cisticola.base import Channel
|
||||
from cisticola.scraper import RumbleScraper
|
||||
from cisticola.transformer import RumbleTransformer
|
||||
from cisticola.base import Post, Media
|
||||
|
||||
@pytest.mark.media
def test_scrape_etl_rumble(engine, controller, etl_controller, channel_kwargs):
    """Scrape the Rumble fixture channel, run the ETL and check the stored posts."""
    controller.reset_db()

    # scrape stage
    rumble_channels = [Channel(**channel_kwargs['rumble'])]
    controller.register_scraper(scraper=RumbleScraper())
    controller.scrape_channels(channels=rumble_channels, archive_media=True)

    # transform stage: posts first, then channel info
    etl_controller.register_transformer(RumbleTransformer())
    etl_controller.transform_all_untransformed()
    etl_controller.transform_all_untransformed_info()

    # read the results back through a fresh session bound to the test engine
    make_session = sessionmaker(bind=engine)
    session = make_session()

    posts = session.query(Post).all()
    media = session.query(Media).all()

    assert len(posts) == 7
    # assert len(media) == 0

    expected_snippet = '#whitegold #icedoutcuban'
    assert expected_snippet in posts[0].content
    # assert json.loads(media[0].exif)['Composite:ImageSize'] == "826 728"
|
||||
34
tests/transformer/telegram_telethon.py
Normal file
34
tests/transformer/telegram_telethon.py
Normal file
@@ -0,0 +1,34 @@
|
||||
from sqlalchemy.orm import sessionmaker, with_polymorphic
|
||||
import json
|
||||
|
||||
import pytest
|
||||
|
||||
from cisticola.base import Channel
|
||||
from cisticola.scraper import TelegramTelethonScraper
|
||||
from cisticola.transformer import TelegramTelethonTransformer
|
||||
from cisticola.base import Post, Media
|
||||
|
||||
@pytest.mark.media
def test_scrape_etl_telegram_telethon(engine, controller, etl_controller, channel_kwargs):
    """Scrape the Telegram fixture channel, run the ETL and check posts and media."""
    controller.reset_db()

    # scrape stage
    telegram_channels = [Channel(**channel_kwargs['telegram'])]
    controller.register_scraper(scraper=TelegramTelethonScraper())
    controller.scrape_channels(channels=telegram_channels, archive_media=True)

    # transform stage: posts first, then channel info
    etl_controller.register_transformer(TelegramTelethonTransformer())
    etl_controller.transform_all_untransformed()
    etl_controller.transform_all_untransformed_info()

    # read the results back through a fresh session bound to the test engine
    make_session = sessionmaker(bind=engine)
    session = make_session()

    posts = session.query(Post).all()
    media = session.query(Media).all()

    assert len(posts) == 19
    assert len(media) == 13

    assert posts[16].content == "Taking pre-orders now"

    first_media_exif = json.loads(media[0].exif)
    assert first_media_exif['Composite:ImageSize'] == "1028 1280"
|
||||
@@ -1,4 +1,4 @@
|
||||
from sqlalchemy.orm import sessionmaker, with_polymorphic
|
||||
from sqlalchemy.orm import sessionmaker
|
||||
import json
|
||||
|
||||
import pytest
|
||||
@@ -18,6 +18,7 @@ def test_scrape_etl_twitter(engine, controller, etl_controller, channel_kwargs):
|
||||
|
||||
etl_controller.register_transformer(TwitterTransformer())
|
||||
etl_controller.transform_all_untransformed()
|
||||
etl_controller.transform_all_untransformed_info()
|
||||
|
||||
sessionfactory = sessionmaker()
|
||||
sessionfactory.configure(bind=engine)
|
||||
@@ -26,8 +27,8 @@ def test_scrape_etl_twitter(engine, controller, etl_controller, channel_kwargs):
|
||||
posts = session.query(Post).all()
|
||||
media = session.query(Media).all()
|
||||
|
||||
assert len(posts) == 10
|
||||
assert len(media) == 7
|
||||
assert len(posts) == 12
|
||||
assert len(media) == 4
|
||||
|
||||
assert posts[-1].content == "BARN"
|
||||
assert json.loads(media[-1].exif)['Composite:ImageSize'] == "826 728"
|
||||
assert posts[2].content == "BARN"
|
||||
assert json.loads(media[0].exif)['Composite:ImageSize'] == "826 728"
|
||||
Reference in New Issue
Block a user