diff --git a/Pipfile b/Pipfile index cb39b78..897c84d 100644 --- a/Pipfile +++ b/Pipfile @@ -27,7 +27,7 @@ langdetect = "*" spacy = "==3.2.4" ocrd-pyexiftool = "*" gabber = {git = "https://github.com/stanfordio/gabber.git"} -snscrape = {git = "https://github.com/bellingcat/snscrape"} +snscrape = {path = "/home/work/Documents/Bellingcat/qanon_project/snscrape", editable = true} polyphemus = {git = "https://github.com/bellingcat/polyphemus"} [dev-packages] diff --git a/Pipfile.lock b/Pipfile.lock index 6875bcd..c77d7b5 100644 --- a/Pipfile.lock +++ b/Pipfile.lock @@ -1,7 +1,7 @@ { "_meta": { "hash": { - "sha256": "13cc50755a59b2cd8bf93049a9a695aa27d35b973b0bdc154af5d21ce48fd57f" + "sha256": "373467351f214f6ce3c87b0914e4d2be8e5b48881304e94ca7a394b0dd7fb9f6" }, "pipfile-spec": 6, "requires": { @@ -21,46 +21,48 @@ "sha256:58d5c3d29f5a36ffeb94f02f0d786cd53014cf9b3b3951d42e0080d8a9498d30", "sha256:ad9aa55b65ef2808eb405f46cf74df7fcb7044d5cbc26487f96eb2ef2e436693" ], + "markers": "python_version >= '3.6'", "version": "==4.11.1" }, "blis": { "hashes": [ - "sha256:148f59a0a47a38ce82e3afc50c709494d5e5a494bef28ce1519c7a17346c645b", - "sha256:1667db8439d9ca41c0c1f0ea954d87462be01b125436c4b264f73603c9fb4e82", - "sha256:3e024f103522e72a27019cfcfe14569522a394f5d651565560a18040fdd69a6c", - "sha256:4a48eeaa506f176bcac306378f5e8063697c93e26d2418fcbe053e8912019090", - "sha256:5d4a81f9438db7a19ac8e64ad41331f65a659ea8f3bb1889a9c2088cfd9fe104", - "sha256:64bef63b1abd5b41819ea53897bdbc03c631a59c1757a9393e6ae0828692f31c", - "sha256:680480dfa16b354f2e4d584edb8d36f0505ed8df12939beee2d161aea7bb3609", - "sha256:76d13dbcd648ca33dfc83569bb219d0696e4f6e5ad00b9f538332a3bdb28ff30", - "sha256:7865e39cac4e10506afc49213938fb7e13bf73ca980c9c20ffad2de4ef858f43", - "sha256:929a6e70b800f9df505f08ed3e863bf5fd0a209aed45fb38a0fd2b8342c04981", - "sha256:a0183760604b14e8eb671a431d06606594def03c36aaaa2a2e7b7f88382dac76", - "sha256:b1e0567cde024e6ef677fe825d934baa7362cd71450c98e5198538026a86e896", - "sha256:b5e0acc760daf5c3b45bce44653943e3a04d81c21c5b92213ed51664525dc24e", - "sha256:bead485e5d79d3eb62a8df55618743878fb3cba606aaf926153db5803270b185", - "sha256:cfb7d730fef706f3ea4389196ce5f610f24cc83f828c498a275c12f05f0cf5c4", - "sha256:d6055ced65d6581ab4f1da0d3f6ec14c60512474c5c9b3210c9f30dd7dd1447d", - "sha256:e22145110864bcffb1d52cb57050b67b8a8ecd43c7c0a1ac0bcdb2c85c8bf416", - "sha256:ee19fddb5964570d97c2096a9a1e595fa48abdde187b14f99dcea7bb546989a6", - "sha256:f4109cce38e644e81d923836b34024905d59e88c8fb48b89b420f4d7661cd89f" + "sha256:0f7bfdee74ac695c35360ace00f2630c1b47406dc0b99ba9211bfa8588bfbed9", + "sha256:159a1a9b32213d99d1415789ac66ed8d23442a696d9d376c66d7b791d3eae575", + "sha256:17df5ac7d9a9dbbf0415f8f8392fbdf1790fa394f89d695bae5e2e7e361c852b", + "sha256:1e970ba1eb12ca38fb5d57f379472125bc3f5106c8214dc847fe79b027212135", + "sha256:1f5fa330ab66d0e92a845b1db361ec8bf3dc4bc7e0dc0ded94f36b8e9f731650", + "sha256:2778fe0ba0e25c157839fdd19ed66b9a340c92d4e92e707b7fa9aa21c51cb254", + "sha256:294421b720c2de904908de841464c667e1a5c5e9f3db6931dfa29cf369d3653a", + "sha256:2db369a4f95927be37e11790dd1ccbf99fd6201eaffbcf408546db847b7b5740", + "sha256:4e7b7b8bc8cf5e82958bbc393e0167318a930d394cbbf04c1ba18cfabaef5818", + "sha256:63735128c9cae44dc6cbf7557327385df0c4ed2dc2c45a00dabfde1e4d00802d", + "sha256:66b8ca1a2eb8f1e0563a592aae4b8682b66189ad560e3b8221d93eab0cb76582", + "sha256:90f17543e0aa3bc379d139867467df2c365ffaf5b61988de12dbba6dbbc9fab4", + "sha256:95d22d3007cb454d11a478331690629861f7d40b4668f9fccfd13b6507ed099b", + "sha256:96ff4c0c1ceab9f94c14b3281f3cef82f593c48c3b5f6169bd51cdcd315e0a6e", + "sha256:ae5b06fe3b94645ac5d93cbc7c0129639cc3e0d50b4efb361a20a9e160277a92", + "sha256:bf60f634481c3d0faf831ac4f2d1c75343e98f714dc88e3fb3c329758577e772", + "sha256:bfa56e7ef14ae607d8444eb344d22f252a2e0b0f9bfa4bdc9b0c48a9f96b5461", + "sha256:f576ad64b772b6fd7df6ef94986235f321983dc870d0f76d78c931bafc41cfa4", + "sha256:f7d541bb06323aa350163ba4a3ad00e8effb3b53d4c58ee6228224f3928b6c57" ], - "version": "==0.7.7" + "version": "==0.7.8" }, "boto3": { "hashes": [ - "sha256:1c13d555172cf88eb645af2429e4a7f42be85e365d6ffc110c952a556d3f8808", - "sha256:4af6a8bc5110b5f9d2fbd00a3c110e4c4cc36fae78d05afa354831f5789e363b" + "sha256:2c6f7e4103d41ca07d6b934a6612e4b9a2666eae36e8289f88726868534b8de2", + "sha256:422c000ff2ee5226e89fe427a9c4c09db095d69c179a3bcc3cfba37cbc5e787e" ], "index": "pypi", - "version": "==1.24.6" + "version": "==1.24.16" }, "botocore": { "hashes": [ - "sha256:97c909a6ec5ad421573c18ae67fc6ea4232502cd30cffaf03bfcb584d9df652d", - "sha256:eeebe304161db6828413dc358ea80ece52f4ddbc8ecde4dd58978d5861a09293" + "sha256:b3b9710902f675a11f5bfd46afda770150530876ae6541d099584462bf949fd1", + "sha256:f117d59899d21beeb200130d7af2090a8112d702a06e2c2794ef576bcea36773" ], - "version": "==1.27.6" + "markers": "python_version >= '3.7'", + "version": "==1.27.16" }, "brotli": { "hashes": [ @@ -142,6 +144,7 @@ "sha256:6a94c6402995a99c3970cc7e4884bb60b4a8639938157eeed436098bf9831757", "sha256:f9f17d2aec496a9aa6b76f53e3b614c965223c061982d434d160f930c698a9db" ], + "markers": "python_version ~= '3.7'", "version": "==5.2.0" }, "catalogue": { @@ -149,20 +152,23 @@ "sha256:535d33ae79ebd21ca298551d85da186ae8b8e1df36b0fb0246da774163ec2d6b", "sha256:cab4feda641fe05da1e6a1a9d123b0869d5ca324dcd93d4a5c384408ab62e7fb" ], + "markers": "python_version >= '3.6'", "version": "==2.0.7" }, "certifi": { "hashes": [ - "sha256:9c5705e395cd70084351dd8ad5c41e65655e08ce46f2ec9cf6c2c08390f71eb7", - "sha256:f1d53542ee8cbedbe2118b5686372fb33c297fcd6379b050cca0ef13a597382a" + "sha256:84c85a9078b11105f04f3036a9482ae10e4621616db313fe045dd24743a0820d", + "sha256:fe86415d55e84719d75f8b69414f6438ac3547d2078ab91b67e779ef69378412" ], - "version": "==2022.5.18.1" + "markers": "python_version >= '3.6'", + "version": "==2022.6.15" }, "charset-normalizer": { "hashes": [ "sha256:2857e29ff0d34db842cd7ca3230549d1a697f96ee6d3fb071cfa6c7393832597", "sha256:6881edbebdb17b39b4eaaa821b438bf6eddffb4468cf344f09f89def34a8b1df" ], + "markers": "python_version >= '3.5'", "version": "==2.0.12" }, "click": { @@ -170,6 +176,7 @@ "sha256:6a7a62563bbfabfda3a38f3023a1db4a35978c0abd76f6c9605ecd6554d6d9b1", "sha256:8458d7b1287c5fb128c90e23381cf99dcde74beaf6c7ff6384ce84d6fe090adb" ], + "markers": "python_version >= '3.6'", "version": "==8.0.4" }, "cryptg": { @@ -254,14 +261,24 @@ "index": "pypi", "version": "==0.2.0" }, + "filelock": { + "hashes": [ + "sha256:37def7b658813cda163b56fc564cdc75e86d338246458c4c28ae84cabefa2404", + "sha256:3a0fd85166ad9dbab54c9aec96737b744106dc5f15c0b09a6744a445299fcf04" + ], + "markers": "python_version >= '3.7'", + "version": "==3.7.1" + }, "future": { "hashes": [ "sha256:b1bead90b70cf6ec3f0710ae53a525360fa360d306a86583adc6bf83a4db537d" ], + "markers": "python_version >= '2.6' and python_version not in '3.0, 3.1, 3.2, 3.3'", "version": "==0.18.2" }, "gabber": { - "git": "https://github.com/stanfordio/gabber.git" + "git": "https://github.com/stanfordio/gabber.git", + "ref": "a032db8047fa6b762b2fc127b08ee37d6ad9e110" }, "gogettr": { "hashes": [ @@ -273,16 +290,18 @@ }, "google-auth": { "hashes": [ - "sha256:8a954960f852d5f19e6af14dd8e75c20159609e85d8db37e4013cc8c3824a7e1", - "sha256:df549a1433108801b11bdcc0e312eaf0d5f0500db42f0523e4d65c78722e8475" + "sha256:819b70140d05501739e1387291d39f0de3b4dff3b00ae4aff8e7a05369957f89", + "sha256:9b1da39ab8731c3061f36fefde9f8bb902dbee9eb28e3a67e8cfa7dc1be76227" ], - "version": "==2.7.0" + "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4, 3.5'", + "version": "==2.8.0" }, "google-auth-oauthlib": { "hashes": [ "sha256:6d6161d0ec0a62e2abf2207c6071c117ec5897b300823c4bb2d963ee86e20e4f", "sha256:d5e98a71203330699f92a26bc08847a92e8c3b1b8d82a021f1af34164db143ae" ], + "markers": "python_version >= '3.6'", "version": "==0.5.2" }, "greenlet": { @@ -342,7 +361,7 @@ "sha256:f27f0875e0873f6bf5df09a456bfcac0667824cabac4cad30b43f36e0382ffe7", "sha256:fcd4a6d04995f1d66bc78b503e4e59ae72fd32aaec4f661657fe5ae5c1aa4ce3" ], - "markers": "python_version >= '3' and (platform_machine == 'aarch64' or (platform_machine == 'ppc64le' or (platform_machine == 'x86_64' or (platform_machine == 'amd64' or (platform_machine == 'AMD64' or (platform_machine == 'win32' or platform_machine == 'WIN32'))))))", + "markers": "python_version >= '3' and platform_machine == 'aarch64' or (platform_machine == 'ppc64le' or (platform_machine == 'x86_64' or (platform_machine == 'amd64' or (platform_machine == 'AMD64' or (platform_machine == 'win32' or platform_machine == 'WIN32')))))", "version": "==2.0.0a2" }, "gspread": { @@ -358,6 +377,7 @@ "sha256:84d9dd047ffa80596e0f246e2eab0b391788b0503584e8945f2368256d2735ff", "sha256:9d643ff0a55b762d5cdb124b8eaa99c66322e2157b69160bc32796e824360e6d" ], + "markers": "python_version >= '3.5'", "version": "==3.3" }, "instaloader": { @@ -372,20 +392,23 @@ "sha256:31351a702a408a9e7595a8fc6150fc3f43bb6bf7e319770cbc0db9df9437e852", "sha256:6088930bfe239f0e6710546ab9c19c9ef35e29792895fed6e6e31a023a182a61" ], + "markers": "python_version >= '3.7'", "version": "==3.1.2" }, "jmespath": { "hashes": [ - "sha256:a490e280edd1f57d6de88636992d05b71e97d69a26a19f058ecf7d304474bf5e", - "sha256:e8dcd576ed616f14ec02eed0005c85973b5890083313860136657e24784e4c04" + "sha256:02e2e4cc71b5bcab88332eebf907519190dd9e6e82107fa7f83b1003a6252980", + "sha256:90261b206d6defd58fdd5e85f478bf633a2901798906be2ad389150c5c60edbe" ], - "version": "==1.0.0" + "markers": "python_version >= '3.7'", + "version": "==1.0.1" }, "langcodes": { "hashes": [ "sha256:4d89fc9acb6e9c8fdef70bcdf376113a3db09b67285d9e1d534de6d8818e7e69", "sha256:794d07d5a28781231ac335a1561b8442f8648ca07cd518310aeb45d6f0807ef6" ], + "markers": "python_version >= '3.6'", "version": "==3.3.0" }, "langdetect": { @@ -398,11 +421,11 @@ }, "loguru": { "hashes": [ - "sha256:066bd06758d0a513e9836fd9c6b5a75bfb3fd36841f4b996bc60b547a309d41c", - "sha256:4e2414d534a2ab57573365b3e6d0234dfb1d84b68b7f3b948e6fb743860a77c3" + "sha256:b28e72ac7a98be3d28ad28570299a393dfcd32e5e3f6a353dec94675767b6319", + "sha256:f8087ac396b5ee5f67c963b495d615ebbceac2796379599820e324419d53667c" ], "index": "pypi", - "version": "==0.6.0" + "version": "==0.5.3" }, "lxml": { "hashes": [ @@ -516,6 +539,7 @@ "sha256:f121a1420d4e173a5d96e47e9a0c0dcff965afdf1626d28de1460815f7c4ee7a", "sha256:fc7b548b17d238737688817ab67deebb30e8073c95749d55538ed473130ec0c7" ], + "markers": "python_version >= '3.7'", "version": "==2.1.1" }, "murmurhash": { @@ -547,40 +571,43 @@ "sha256:6397602efb3c2d7baebd2166ed85731ae1c1d475abca22090b7141ff5034b3e1", "sha256:9c9f243fcec7f410f138cb12c21c84c64fde4195481a30c9bfb05b5f003adfed" ], + "markers": "python_version >= '3.5' and python_version < '4.0'", "version": "==1.45.1" }, "numpy": { "hashes": [ - "sha256:020218fc82390f1d537cb193d6f1449a919ec97df69b5a64c0a1d017486e0032", - "sha256:04e4dbe6b777e977813e7ff5f43aa030ef4f6f75cbc1a4504d3135942b5c12fe", - "sha256:1c881827ff0ad7d607047c19a075a7d7c7125cc103fb969a9200bad26175fb9d", - "sha256:1f22f6f3cb7094ad77c8d352e4bfd2c1db1c38bc08d0b6c74e9b46343c53b052", - "sha256:279dce16b143bc50d49bab52dc279d6ab5b0edc7f4d2cc7edaf6a547586bda7e", - "sha256:2e66decdea13ae8091ba480209dd5ce31261fa3b021ec06b30bd2f4a304861b7", - "sha256:548f4d86aa259a448f2da0c07df070bf1f71b68c1f84b1356d4a2ed832598758", - "sha256:6fbd492bead87ab83240c56b3490ac301595ab1399ace3e3c1b7c130e3529358", - "sha256:785d6520f7bf10ff188762bc460579d6a31c11f960976b2a29efc383b0346572", - "sha256:804293d9bdf33f9c9fb0b4a753f9e84114bb0ad538d184fc579b30782326c827", - "sha256:82e69890c394a4e1cbcaf12b47d8477bbac4635866fc46a77670abbe4bb4085d", - "sha256:8e8a88657c028b8b77f3df6f266a5e6ffb4419cbc3dfb525cbbb80ba710f5da2", - "sha256:9793feff4758c68502f7652fab08e5ec427d9973d26014767cc15c1b1d885f56", - "sha256:9810b840a751b6f0c73c21fb2a50e306d7d0be4114cded4c7d069e142ce488cf", - "sha256:ae7e8801b93124a6b0becedc06285ddbaca2daab2d30e35ea413d3bec252717a", - "sha256:b7be00b0a76384490845395714e62f597e64bc6dc8f8a14be0e96034dde3667c", - "sha256:c308afc8ec782badd073999385a6c93c27ee68e6c0991697394d4fd56566af1f", - "sha256:c335800064f04e0b474b64779ab234ae23c0a5b2f5a06284bb07d297d73692bd", - "sha256:d17f7feb2cca596daa4b3dae86b611a13e9ace061e6583a8db21841f529ca891", - "sha256:dbc987d14f46ae4c476068543d3ad2a20e7ebcb06b211eb4292224dc136eb01d", - "sha256:f5a1c7c45ff29db501f9e38a360aedd833e355c14c75155ba2bd46ee3799e30a", - "sha256:fde47931544086a648b12ee7c9ccf30edd6c6db776005fb07e4a019a04980042" + "sha256:092f5e6025813e64ad6d1b52b519165d08c730d099c114a9247c9bb635a2a450", + "sha256:196cd074c3f97c4121601790955f915187736f9cf458d3ee1f1b46aff2b1ade0", + "sha256:1c29b44905af288b3919803aceb6ec7fec77406d8b08aaa2e8b9e63d0fe2f160", + "sha256:2b2da66582f3a69c8ce25ed7921dcd8010d05e59ac8d89d126a299be60421171", + "sha256:5043bcd71fcc458dfb8a0fc5509bbc979da0131b9d08e3d5f50fb0bbb36f169a", + "sha256:58bfd40eb478f54ff7a5710dd61c8097e169bc36cc68333d00a9bcd8def53b38", + "sha256:79a506cacf2be3a74ead5467aee97b81fca00c9c4c8b3ba16dbab488cd99ba10", + "sha256:94b170b4fa0168cd6be4becf37cb5b127bd12a795123984385b8cd4aca9857e5", + "sha256:97a76604d9b0e79f59baeca16593c711fddb44936e40310f78bfef79ee9a835f", + "sha256:98e8e0d8d69ff4d3fa63e6c61e8cfe2d03c29b16b58dbef1f9baa175bbed7860", + "sha256:ac86f407873b952679f5f9e6c0612687e51547af0e14ddea1eedfcb22466babd", + "sha256:ae8adff4172692ce56233db04b7ce5792186f179c415c37d539c25de7298d25d", + "sha256:bd3fa4fe2e38533d5336e1272fc4e765cabbbde144309ccee8675509d5cd7b05", + "sha256:d0d2094e8f4d760500394d77b383a1b06d3663e8892cdf5df3c592f55f3bff66", + "sha256:d54b3b828d618a19779a84c3ad952e96e2c2311b16384e973e671aa5be1f6187", + "sha256:d6ca8dabe696c2785d0c8c9b0d8a9b6e5fdbe4f922bde70d57fa1a2848134f95", + "sha256:d8cc87bed09de55477dba9da370c1679bd534df9baa171dd01accbb09687dac3", + "sha256:f0f18804df7370571fb65db9b98bf1378172bd4e962482b857e612d1fec0f53e", + "sha256:f1d88ef79e0a7fa631bb2c3dda1ea46b32b1fe614e10fedd611d3d5398447f2f", + "sha256:f9c3fc2adf67762c9fe1849c859942d23f8d3e0bee7b5ed3d4a9c3eeb50a2f07", + "sha256:fc431493df245f3c627c0c05c2bd134535e7929dbe2e602b80e42bf52ff760bc", + "sha256:fe8b9683eb26d2c4d5db32cd29b38fdcf8381324ab48313b5b69088e0e355379" ], - "version": "==1.23.0rc2" + "markers": "python_version >= '3.8'", + "version": "==1.23.0" }, "oauthlib": { "hashes": [ "sha256:23a8208d75b902797ea29fd31fa80a15ed9dc2c6c16fe73f5d346f83f6fa27a2", "sha256:6db33440354787f9b7f3a6dbd4febf5d0f93758354060e802f6c06cb493022fe" ], + "markers": "python_version >= '3.6'", "version": "==3.2.0" }, "ocrd-pyexiftool": { @@ -597,13 +624,42 @@ "sha256:dd47c42927d89ab911e606518907cc2d3a1f38bbd026385970643f9c5b8ecfeb", "sha256:ef103e05f519cdc783ae24ea4e2e0f508a9c99b2d4969652eed6a2e1ea5bd522" ], + "markers": "python_version >= '3.6'", "version": "==21.3" }, + "pandas": { + "hashes": [ + "sha256:07238a58d7cbc8a004855ade7b75bbd22c0db4b0ffccc721556bab8a095515f6", + "sha256:0daf876dba6c622154b2e6741f29e87161f844e64f84801554f879d27ba63c0d", + "sha256:16ad23db55efcc93fa878f7837267973b61ea85d244fc5ff0ccbcfa5638706c5", + "sha256:1d9382f72a4f0e93909feece6fef5500e838ce1c355a581b3d8f259839f2ea76", + "sha256:24ea75f47bbd5574675dae21d51779a4948715416413b30614c1e8b480909f81", + "sha256:2893e923472a5e090c2d5e8db83e8f907364ec048572084c7d10ef93546be6d1", + "sha256:2ff7788468e75917574f080cd4681b27e1a7bf36461fe968b49a87b5a54d007c", + "sha256:41fc406e374590a3d492325b889a2686b31e7a7780bec83db2512988550dadbf", + "sha256:48350592665ea3cbcd07efc8c12ff12d89be09cd47231c7925e3b8afada9d50d", + "sha256:605d572126eb4ab2eadf5c59d5d69f0608df2bf7bcad5c5880a47a20a0699e3e", + "sha256:6dfbf16b1ea4f4d0ee11084d9c026340514d1d30270eaa82a9f1297b6c8ecbf0", + "sha256:6f803320c9da732cc79210d7e8cc5c8019aad512589c910c66529eb1b1818230", + "sha256:721a3dd2f06ef942f83a819c0f3f6a648b2830b191a72bbe9451bcd49c3bd42e", + "sha256:755679c49460bd0d2f837ab99f0a26948e68fa0718b7e42afbabd074d945bf84", + "sha256:78b00429161ccb0da252229bcda8010b445c4bf924e721265bec5a6e96a92e92", + "sha256:958a0588149190c22cdebbc0797e01972950c927a11a900fe6c2296f207b1d6f", + "sha256:a3924692160e3d847e18702bb048dc38e0e13411d2b503fecb1adf0fcf950ba4", + "sha256:d51674ed8e2551ef7773820ef5dab9322be0828629f2cbf8d1fc31a0c4fed640", + "sha256:d5ebc990bd34f4ac3c73a2724c2dcc9ee7bf1ce6cf08e87bb25c6ad33507e318", + "sha256:d6c0106415ff1a10c326c49bc5dd9ea8b9897a6ca0c8688eb9c30ddec49535ef", + "sha256:e48fbb64165cda451c06a0f9e4c7a16b534fcabd32546d531b3c240ce2844112" + ], + "markers": "python_version >= '3.8'", + "version": "==1.4.3" + }, "pathy": { "hashes": [ "sha256:25fd04cec6393661113086730ce69c789d121bea83ab1aa18452e8fd42faf29a", "sha256:838624441f799a06b446a657e4ecc9ebc3fdd05234397e044a7c87e8f6e76b1c" ], + "markers": "python_version >= '3.6'", "version": "==0.6.1" }, "pillow": { @@ -647,10 +703,12 @@ "sha256:f3f6a6034140e9e17e9abc175fc7a266a6e63652028e157750bd98e804a8ed9a", "sha256:ffde4c6fabb52891d81606411cbfaf77756e3b561b566efd270b3ed3791fde4e" ], + "markers": "python_version >= '3.7'", "version": "==9.1.1" }, "polyphemus": { - "git": "https://github.com/bellingcat/polyphemus" + "git": "https://github.com/bellingcat/polyphemus", + "ref": "b18e5591fa4f903e5506742c2e3f17d45bb88755" }, "preshed": { "hashes": [ @@ -737,35 +795,39 @@ }, "pycryptodomex": { "hashes": [ - "sha256:1ca8e1b4c62038bb2da55451385246f51f412c5f5eabd64812c01766a5989b4a", - "sha256:298c00ea41a81a491d5b244d295d18369e5aac4b61b77b2de5b249ca61cd6659", - "sha256:2aa887683eee493e015545bd69d3d21ac8d5ad582674ec98f4af84511e353e45", - "sha256:2ce76ed0081fd6ac8c74edc75b9d14eca2064173af79843c24fa62573263c1f2", - "sha256:3da13c2535b7aea94cc2a6d1b1b37746814c74b6e80790daddd55ca5c120a489", - "sha256:406ec8cfe0c098fadb18d597dc2ee6de4428d640c0ccafa453f3d9b2e58d29e2", - "sha256:4d0db8df9ffae36f416897ad184608d9d7a8c2b46c4612c6bc759b26c073f750", - "sha256:530756d2faa40af4c1f74123e1d889bd07feae45bac2fd32f259a35f7aa74151", - "sha256:77931df40bb5ce5e13f4de2bfc982b2ddc0198971fbd947776c8bb5050896eb2", - "sha256:797a36bd1f69df9e2798e33edb4bd04e5a30478efc08f9428c087f17f65a7045", - "sha256:8085bd0ad2034352eee4d4f3e2da985c2749cb7344b939f4d95ead38c2520859", - "sha256:8536bc08d130cae6dcba1ea689f2913dfd332d06113904d171f2f56da6228e89", - "sha256:a4d412eba5679ede84b41dbe48b1bed8f33131ab9db06c238a235334733acc5e", - "sha256:aebecde2adc4a6847094d3bd6a8a9538ef3438a5ea84ac1983fcb167db614461", - "sha256:b276cc4deb4a80f9dfd47a41ebb464b1fe91efd8b1b8620cf5ccf8b824b850d6", - "sha256:b5a185ae79f899b01ca49f365bdf15a45d78d9856f09b0de1a41b92afce1a07f", - "sha256:c4d8977ccda886d88dc3ca789de2f1adc714df912ff3934b3d0a3f3d777deafb", - "sha256:c5dd3ffa663c982d7f1be9eb494a8924f6d40e2e2f7d1d27384cfab1b2ac0662", - "sha256:ca88f2f7020002638276439a01ffbb0355634907d1aa5ca91f3dc0c2e44e8f3b", - "sha256:d2cce1c82a7845d7e2e8a0956c6b7ed3f1661c9acf18eb120fc71e098ab5c6fe", - "sha256:d709572d64825d8d59ea112e11cc7faf6007f294e9951324b7574af4251e4de8", - "sha256:da8db8374295fb532b4b0c467e66800ef17d100e4d5faa2bbbd6df35502da125", - "sha256:e36c7e3b5382cd5669cf199c4a04a0279a43b2a3bdd77627e9b89778ac9ec08c", - "sha256:e95a4a6c54d27a84a4624d2af8bb9ee178111604653194ca6880c98dcad92f48", - "sha256:ee835def05622e0c8b1435a906491760a43d0c462f065ec9143ec4b8d79f8bff", - "sha256:f75009715dcf4a3d680c2338ab19dac5498f8121173a929872950f4fb3a48fbf", - "sha256:f8524b8bc89470cec7ac51734907818d3620fb1637f8f8b542d650ebec42a126" + "sha256:04cc393045a8f19dd110c975e30f38ed7ab3faf21ede415ea67afebd95a22380", + "sha256:0776bfaf2c48154ab54ea45392847c1283d2fcf64e232e85565f858baedfc1fa", + "sha256:0fadb9f7fa3150577800eef35f62a8a24b9ddf1563ff060d9bd3af22d3952c8c", + "sha256:18e2ab4813883ae63396c0ffe50b13554b32bb69ec56f0afaf052e7a7ae0d55b", + "sha256:191e73bc84a8064ad1874dba0ebadedd7cce4dedee998549518f2c74a003b2e1", + "sha256:35a8f7afe1867118330e2e0e0bf759c409e28557fb1fc2fbb1c6c937297dbe9a", + "sha256:3709f13ca3852b0b07fc04a2c03b379189232b24007c466be0f605dd4723e9d4", + "sha256:4540904c09704b6f831059c0dfb38584acb82cb97b0125cd52688c1f1e3fffa6", + "sha256:463119d7d22d0fc04a0f9122e9d3e6121c6648bcb12a052b51bd1eed1b996aa2", + "sha256:46b3f05f2f7ac7841053da4e0f69616929ca3c42f238c405f6c3df7759ad2780", + "sha256:48697790203909fab02a33226fda546604f4e2653f9d47bc5d3eb40879fa7c64", + "sha256:5676a132169a1c1a3712edf25250722ebc8c9102aa9abd814df063ca8362454f", + "sha256:65204412d0c6a8e3c41e21e93a5e6054a74fea501afa03046a388cf042e3377a", + "sha256:67e1e6a92151023ccdfcfbc0afb3314ad30080793b4c27956ea06ab1fb9bcd8a", + "sha256:6f5b6ba8aefd624834bc177a2ac292734996bb030f9d1b388e7504103b6fcddf", + "sha256:7341f1bb2dadb0d1a0047f34c3a58208a92423cdbd3244d998e4b28df5eac0ed", + "sha256:78d9621cf0ea35abf2d38fa2ca6d0634eab6c991a78373498ab149953787e5e5", + "sha256:8eecdf9cdc7343001d047f951b9cc805cd68cb6cd77b20ea46af5bffc5bd3dfb", + "sha256:94c7b60e1f52e1a87715571327baea0733708ab4723346598beca4a3b6879794", + "sha256:996e1ba717077ce1e6d4849af7a1426f38b07b3d173b879e27d5e26d2e958beb", + "sha256:a07a64709e366c2041cd5cfbca592b43998bf4df88f7b0ca73dca37071ccf1bd", + "sha256:b6306403228edde6e289f626a3908a2f7f67c344e712cf7c0a508bab3ad9e381", + "sha256:b9279adc16e4b0f590ceff581f53a80179b02cba9056010d733eb4196134a870", + "sha256:c4cb9cb492ea7dcdf222a8d19a1d09002798ea516aeae8877245206d27326d86", + "sha256:dd452a5af7014e866206d41751886c9b4bf379a339fdf2dbfc7dd16c0fb4f8e0", + "sha256:e2b12968522a0358b8917fc7b28865acac002f02f4c4c6020fcb264d76bfd06d", + "sha256:e3164a18348bd53c69b4435ebfb4ac8a4076291ffa2a70b54f0c4b80c7834b1d", + "sha256:e47bf8776a7e15576887f04314f5228c6527b99946e6638cf2f16da56d260cab", + "sha256:f8be976cec59b11f011f790b88aca67b4ea2bd286578d0bd3e31bcd19afcd3e4", + "sha256:fc9bc7a9b79fe5c750fc81a307052f8daabb709bdaabb0fb18fb136b66b653b5" ], - "version": "==3.14.1" + "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'", + "version": "==3.15.0" }, "pydantic": { "hashes": [ @@ -792,6 +854,7 @@ "sha256:ea5cb40a3b23b3265f6325727ddfc45141b08ed665458be8c6285e7b85bd73a1", "sha256:fec866a0b59f372b7e776f2d7308511784dace622e0992a0b59ea3ccee0ae833" ], + "markers": "python_full_version >= '3.6.1'", "version": "==1.8.2" }, "pyparsing": { @@ -799,8 +862,17 @@ "sha256:2b020ecf7d21b687f219b71ecad3631f644a47f01403fa1d1036b0c6416d70fb", "sha256:5026bae9a10eeaefb61dab2f09052b9f4307d44aee4eda64b309723d8d206bbc" ], + "markers": "python_full_version >= '3.6.8'", "version": "==3.0.9" }, + "pysocks": { + "hashes": [ + "sha256:08e69f092cc6dbe92a0fdd16eeb9b9ffbc13cadfe5ca4c7bd92ffb078b293299", + "sha256:2725bd0a9925919b9b51739eea5f9e2bae91e83288108a9ad338b2e3a4435ee5", + "sha256:3f8804571ebe159c380ac6de37643bb4685970655d3bba243530d6558b799aa0" + ], + "version": "==1.7.1" + }, "pytesseract": { "hashes": [ "sha256:7e2bafc7f48d1bb71443ce4633a56f5e21925a98f220a36c336297edcd1956d0", @@ -814,6 +886,7 @@ "sha256:0123cacc1627ae19ddf3c27a5de5bd67ee4586fbdd6440d9748f8abb483d3e86", "sha256:961d03dc3453ebbc59dbdea9e4e11c5651520a876d0f4db161e8674aae935da9" ], + "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'", "version": "==2.8.2" }, "pytz": { @@ -829,6 +902,7 @@ "sha256:8314c9692a636c8eb3bda879b9f119e350e93223ae83e70e80c31675a0fdc1a6", "sha256:af097bae1b616dde5c5744441e2ddc69e74dfdcb0c263129610d85b87445a59d" ], + "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4, 3.5'", "version": "==0.1.0.post0" }, "ratelimit": { @@ -915,6 +989,7 @@ "sha256:fbc88d3ba402b5d041d204ec2449c4078898f89c4a6e6f0ed1c1a510ef1e221d", "sha256:fbd3fe37353c62fd0eb19fb76f78aa693716262bcd5f9c14bb9e5aca4b3f0dc4" ], + "markers": "python_version >= '3.6'", "version": "==2022.3.2" }, "requests": { @@ -930,6 +1005,7 @@ "sha256:2577c501a2fb8d05a304c09d090d6e47c306fef15809d102b327cf8364bddab5", "sha256:75beac4a47881eeb94d5ea5d6ad31ef88856affe2332b9aafb52c6452ccf0d7a" ], + "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'", "version": "==1.3.1" }, "rsa": { @@ -937,6 +1013,7 @@ "sha256:5c6bd9dc7a543b7fe4304a631f8a8a3b674e2bbfc49c2ae96200cdbe55df6b17", "sha256:95c5d300c4e879ee69708c428ba566c59478fd653cc3a22243eeb8ed846950bb" ], + "markers": "python_version >= '3.6' and python_version < '4.0'", "version": "==4.8" }, "s3transfer": { @@ -944,13 +1021,23 @@ "sha256:06176b74f3a15f61f1b4f25a1fc29a4429040b7647133a463da8fa5bd28d5ecd", "sha256:2ed07d3866f523cc561bf4a00fc5535827981b117dd7876f036b0c1aca42c947" ], + "markers": "python_version >= '3.7'", "version": "==0.6.0" }, + "setuptools": { + "hashes": [ + "sha256:990a4f7861b31532871ab72331e755b5f14efbe52d336ea7f6118144dd478741", + "sha256:c1848f654aea2e3526d17fc3ce6aeaa5e7e24e66e645b5be2171f3f6b4e5a178" + ], + "markers": "python_version >= '3.7'", + "version": "==62.6.0" + }, "six": { "hashes": [ "sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926", "sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254" ], + "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'", "version": "==1.16.0" }, "smart-open": { @@ -958,16 +1045,19 @@ "sha256:71d14489da58b60ce12fc3ecb823facc59a8b23cd1b58edb97175640350d3a62", "sha256:75abf758717a92a8f53aa96953f0c245c8cedf8e1e4184903db3659b419d4c17" ], + "markers": "python_version >= '3.6' and python_version < '4.0'", "version": "==5.2.1" }, "snscrape": { - "git": "https://github.com/bellingcat/snscrape" + "editable": true, + "path": "/home/work/Documents/Bellingcat/qanon_project/snscrape" }, "soupsieve": { "hashes": [ "sha256:3b2503d3c7084a42b1ebd08116e5f81aadfaea95863628c80a3b774a11b7c759", "sha256:fc53893b3da2c33de295667a0e19f078c14bf86544af307354de5fcf12a3f30d" ], + "markers": "python_version >= '3.6'", "version": "==2.3.2.post1" }, "spacy": { @@ -997,6 +1087,7 @@ "sha256:4f7dcbc4e6c8e8cb4eadbb009f9c0a1a2a67442e0032c8d6776c9470c3759903", "sha256:dfd58b0cc65b3596cb06f7b95e7bf4fff34668297c59eb179eb050db07b199df" ], + "markers": "python_version >= '3.6'", "version": "==3.0.9" }, "spacy-loggers": { @@ -1004,49 +1095,49 @@ "sha256:d48c9313a577ad1818da961cf6db71a73fd1e556ae47e6e68d7e28b541d11e18", "sha256:e75d44f4cf99e6763d7132ca7c8c420e0a92790222a08bc8eb9e24ea2c13536e" ], + "markers": "python_version >= '3.6'", "version": "==1.0.2" }, "sqlalchemy": { "hashes": [ - "sha256:06ec11a5e6a4b6428167d3ce33b5bd455c020c867dabe3e6951fa98836e0741d", - "sha256:0e7fd52e48e933771f177c2a1a484b06ea03774fc7741651ebdf19985a34037c", - "sha256:139c50b9384e6d32a74fc4dcd0e9717f343ed38f95dbacf832c782c68e3862f3", - "sha256:17417327b87a0f703c9a20180f75e953315207d048159aff51822052f3e33e69", - "sha256:29a742c29fea12259f1d2a9ee2eb7fe4694a85d904a4ac66d15e01177b17ad7f", - "sha256:2aac2a685feb9882d09f457f4e5586c885d578af4e97a2b759e91e8c457cbce5", - "sha256:3197441772dc3b1c6419f13304402f2418a18d7fe78000aa5a026e7100836739", - "sha256:3688f92c62db6c5df268e2264891078f17ecb91e3141b400f2e28d0f75796dea", - "sha256:3862a069a24f354145e01a76c7c720c263d62405fe5bed038c46a7ce900f5dd6", - "sha256:4a17c1a1152ca4c29d992714aa9df3054da3af1598e02134f2e7314a32ef69d8", - "sha256:4c1d9fb3931e27d59166bb5c4dcc911400fee51082cfba66ceb19ac954ade068", - "sha256:4e8706919829d455a9fa687c6bbd1b048e36fec3919a59f2d366247c2bfdbd9c", - "sha256:50c8eaf44c3fed5ba6758d375de25f163e46137c39fda3a72b9ee1d1bb327dfc", - "sha256:5e4e517ce72fad35cce364a01aff165f524449e9c959f1837dc71088afa2824c", - "sha256:6629c79967a6c92e33fad811599adf9bc5cee6e504a1027bbf9cc1b6fb2d276d", - "sha256:78363f400fbda80f866e8e91d37d36fe6313ff847ded08674e272873c1377ea5", - "sha256:7a44683cf97744a405103ef8fdd31199e9d7fc41b4a67e9044523b29541662b0", - "sha256:7e579d6e281cc937bdb59917017ab98e618502067e04efb1d24ac168925e1d2a", - "sha256:7ee34c85cbda7779d66abac392c306ec78c13f5c73a1f01b8b767916d4895d23", - "sha256:8b38e088659b30c2ca0af63e5d139fad1779a7925d75075a08717a21c406c0f6", - "sha256:9785d6f962d2c925aeb06a7539ac9d16608877da6aeaaf341984b3693ae80a02", - "sha256:a91d0668cada27352432f15b92ac3d43e34d8f30973fa8b86f5e9fddee928f3b", - "sha256:a940c551cfbd2e1e646ceea2777944425f5c3edff914bc808fe734d9e66f8d71", - "sha256:aaa0e90e527066409c2ea5676282cf4afb4a40bb9dce0f56c8ec2768bff22a6e", - "sha256:b4c92823889cf9846b972ee6db30c0e3a92c0ddfc76c6060a6cda467aa5fb694", - "sha256:b55932fd0e81b43f4aff397c8ad0b3c038f540af37930423ab8f47a20b117e4c", - "sha256:c37885f83b59e248bebe2b35beabfbea398cb40960cdc6d3a76eac863d4e1938", - "sha256:caca6acf3f90893d7712ae2c6616ecfeac3581b4cc677c928a330ce6fbad4319", - "sha256:cffc67cdd07f0e109a1fc83e333972ae423ea5ad414585b63275b66b870ea62b", - "sha256:d4c3b009c9220ae6e33f17b45f43fb46b9a1d281d76118405af13e26376f2e11", - "sha256:d58f2d9d1a4b1459e8956a0153a4119da80f54ee5a9ea623cd568e99459a3ef1", - "sha256:d6927c9e3965b194acf75c8e0fb270b4d54512db171f65faae15ef418721996e", - "sha256:d9050b0c4a7f5538650c74aaba5c80cd64450e41c206f43ea6d194ae6d060ff9", - "sha256:eec39a17bab3f69c44c9df4e0ed87c7306f2d2bf1eca3070af644927ec4199fa", - "sha256:f9940528bf9c4df9e3c3872d23078b6b2da6431c19565637c09f1b88a427a684", - "sha256:ffe487570f47536b96eff5ef2b84034a8ba4e19aab5ab7647e677d94a119ea55" + "sha256:07865d93e4ca77b59a5ce0f36fbae8161f7dfe57ba17934a3e442cf95dcb3c49", + "sha256:1ac6b091b322ec54a30c751dfcb736987e317f5c53a5cf3beb62e11a18210319", + "sha256:380e09881cdf3c87e90b8995425f7ea618e6bbd33c6b7c9234af21c4b6b3c143", + "sha256:3abe087b641788abbbe94abbf9f15f50bb985f72c0669ef35d1941d2912a276d", + "sha256:42810e560b57e981ed0a947b65a4936b398b4fca97e5b56e10a9c5a151568de2", + "sha256:42a60988aad143a4b2745711548833f57340d7f35586160140361314a509e6f7", + "sha256:470fd9d820fbd25c2a2a2929327c44aaff9d5871a20e0cadd32d293540817517", + "sha256:492f25432f0a998bcaa35e907f9d33f436d208326bb1e6c0f8485e8117502a3d", + "sha256:55c09559e45d3f067435620195238f983d4a23f796650f959f19964ba9104c6f", + "sha256:57ea67a9206eab2abe130e4fdae0662f10cca3dc72ba27553f70a7d613588571", + "sha256:63f8e68356b53072a653e8f61c5f1c19721469af4dbfdb3e3356073e9918f1fe", + "sha256:6edadd6a0a722c22558e1d1f5360d3e85fa938bc69d9049d29968a643de6dd34", + "sha256:737f4feee88d78230fa38027ad5645cb327fe9aac0dd0bde3f8fa7026ed81910", + "sha256:77831317da71adec7b785ebf9e6467b59ba1e186de1ba13c94b4e4951387ba64", + "sha256:82701a4cbb14affc6c1ae62dcebdaff65611b7c7f96f9d0e92a34a8be112a8fa", + "sha256:93ae1d2ef42fbf0f0b3d44b35225bda123310df4b33c9bf662e7b50a68c48a98", + "sha256:97ba370e31b70be94f2f1e85494a5c90f8cf50381ddc02ab95a33a4a86371e02", + "sha256:a57edcbbb45e8307153c5d4635407df71529ed263666064c0524b0c412778306", + "sha256:ad2447f17425e6889f0fb2b229844799aabafc90ff780123067fc5846a30992c", + "sha256:b33388891faf67d0c4a7bb65657dd1a068168eda4b793cb929c4c3894adfdcf2", + "sha256:b8cd779ef29718f3d2c558042ccc45c03006c599dd722fb760faca641a2f32ac", + "sha256:bdea12b997b174903292cf19f40d36cad46b44b645725b9485164684d1849bfd", + "sha256:bf05b312bf0165f92fa0eb09e7661c26f2f06c7a89694ecb79fa15a933deb768", + "sha256:c715347cac3b1c563941162fbbf751d3a5e0c356a33cb20925699f4910504a8f", + "sha256:cd1aba14bbb1ecfe8b5cc52dc840a7e071cfcce6bff545037cf56714c48dfc92", + "sha256:cf1afb1deec19de7ba282062de8a8c4f931ef120faa8b3dc6fca826bbc2f6a9d", + "sha256:cfdb1b3763aa4bddccd7b627b9466fce94952dc150a49309eb56e5f50dd00806", + "sha256:d3c4191e0348428b127c4c2e25ec9c1e8e895e3c6d9a7f083fca28dce23257ee", + "sha256:d8193b4a340d868f2daeeb856dfae9d9d4b011f249128380a83ee7342a887bdb", + "sha256:da424c8b285da91733fed2dd40fed7db076818a62859244d311b80fc8ba4d75e", + "sha256:e44e5f4d84861f4a2a00da8e55712db0dd2ec3d680544fb5d3ac84d3682d7d4c", + "sha256:eba2c5f717fa6d7be040bbc1e4334f1827d31e672cfd53ddbd995935d43e517e", + "sha256:f036bdc951b0d64c64ae83e7ff83a1848eea74f1c6e42461347caab2ed7282b9", + "sha256:f04789d723fbd6214a63006b4711d7afca37630473edb6ab972c5df2b43b7a56", + "sha256:fa64578158cb374e4dd6da2377f1ceabf9973313d171e67fc01a353aa8967858" ], "index": "pypi", - "version": "==1.4.37" + "version": "==1.4.38" }, "srsly": { "hashes": [ @@ -1070,6 +1161,7 @@ "sha256:f96af9fde9f58d5923091fa723fa0fed58a83781b98e143a5d1fac5e738b9f0d", "sha256:fb08416fd6ef04c51fdeefd6d28592b64563b2853243c571a9b0d67403b5be7f" ], + "markers": "python_version >= '3.6'", "version": "==2.4.3" }, "telethon": { @@ -1102,6 +1194,7 @@ "sha256:eba973fe229e7fa86b99f2c5e2724f7f19040ac75a8ef7c8b23b434dac1eadea", "sha256:fd2d49a80a6c95be4eb0f8370a22eef903ecad10b65762d39c9b192abf905f7c" ], + "markers": "python_version >= '3.6'", "version": "==8.0.17" }, "tqdm": { @@ -1117,6 +1210,7 @@ "sha256:5646aef0d936b2c761a10393f0384ee6b5c7fe0bb3e5cd710b17134ca1d99cff", "sha256:e8467f0ebac0c81366c2168d6ad9f888efdfb6d4e1d3d5b4a004f46fa444b5c3" ], + "markers": "python_version >= '3.6'", "version": "==0.4.1" }, "typing-extensions": { @@ -1124,6 +1218,7 @@ "sha256:6657594ee297170d19f67d55c05852a874e7eb634f4f753dbd667855e07c1708", "sha256:f1c24655a0da0d1b67f07e17a5e6b2a105894e6824b92096378bb3668ef02376" ], + "markers": "python_version >= '3.7'", "version": "==4.2.0" }, "tzdata": { @@ -1139,6 +1234,7 @@ "sha256:89885494684c929d9191c57aa27502afc87a579be5cdd3225c77c463ea043745", "sha256:ee5842fa3a795f023514ac2d801c4a81d1743bbe642e3940143326b3a00addd7" ], + "markers": "python_version >= '3.6'", "version": "==4.2" }, "urllib3": { @@ -1146,6 +1242,7 @@ "sha256:44ece4d53fb1706f667c9bd1c648f5469a2ec925fcf3a776667042d645472c14", "sha256:aabaf16477806a5e1dd19aa41f8c2b7950dd3c746362d7e3223dbe6de6ac448e" ], + "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' and python_version < '4.0'", "version": "==1.26.9" }, "wasabi": { @@ -1206,15 +1303,16 @@ "sha256:fab7c640815812ed5f10fbee7abbf58788d602046b7bb3af9b1ac753a6d5e916", "sha256:fc06cc8073c8e87072138ba1e431300e2d408f054b27047d047b549455066ff4" ], + "markers": "python_version >= '3.7'", "version": "==10.3" }, "yt-dlp": { "hashes": [ - "sha256:3a7b59d2fb4b39ce8ba8e0b9c5a37fe20e5624f46a2346b4ae66ab1320e35134", - "sha256:deec1009442312c1e2ee5298966842194d0e950b433f0d4fc844ef464b9c32a7" + "sha256:3f2899b9082b50c890ce10beda4493fbbc016f4d73b924dfc26be670e5cae9fa", + "sha256:ee401a9dcc7e9285b14f13229c3dcefdf387e597f4f4f773dab326aafe3b830c" ], "index": "pypi", - "version": "==2022.5.18" + "version": "==2022.6.22.1" } }, "develop": { @@ -1230,14 +1328,16 @@ "sha256:2d27e3784d7a565d36ab851fe94887c5eccd6a463168875832a1be79c82828b4", "sha256:626ba8234211db98e869df76230a137c4c40a12d72445c45d5f5b716f076e2fd" ], + "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'", "version": "==21.4.0" }, "babel": { "hashes": [ - "sha256:3f349e85ad3154559ac4930c3918247d319f21910d5ce4b25d439ed8693b98d2", - "sha256:98aeaca086133efb3e1e2aad0396987490c8425929ddbcfe0550184fdc54cd13" + "sha256:7614553711ee97490f732126dc077f8d0ae084ebc6a96e23db1482afabdb2c51", + "sha256:ff56f4892c1c4bf0d814575ea23471c230d544203c7748e8c68f0089478d48eb" ], - "version": "==2.10.1" + "markers": "python_version >= '3.6'", + "version": "==2.10.3" }, "black": { "hashes": [ @@ -1270,16 +1370,18 @@ }, "certifi": { "hashes": [ - "sha256:9c5705e395cd70084351dd8ad5c41e65655e08ce46f2ec9cf6c2c08390f71eb7", - "sha256:f1d53542ee8cbedbe2118b5686372fb33c297fcd6379b050cca0ef13a597382a" + "sha256:84c85a9078b11105f04f3036a9482ae10e4621616db313fe045dd24743a0820d", + "sha256:fe86415d55e84719d75f8b69414f6438ac3547d2078ab91b67e779ef69378412" ], - "version": "==2022.5.18.1" + "markers": "python_version >= '3.6'", + "version": "==2022.6.15" }, "charset-normalizer": { "hashes": [ "sha256:2857e29ff0d34db842cd7ca3230549d1a697f96ee6d3fb071cfa6c7393832597", "sha256:6881edbebdb17b39b4eaaa821b438bf6eddffb4468cf344f09f89def34a8b1df" ], + "markers": "python_version >= '3.5'", "version": "==2.0.12" }, "click": { @@ -1287,9 +1389,13 @@ "sha256:6a7a62563bbfabfda3a38f3023a1db4a35978c0abd76f6c9605ecd6554d6d9b1", "sha256:8458d7b1287c5fb128c90e23381cf99dcde74beaf6c7ff6384ce84d6fe090adb" ], + "markers": "python_version >= '3.6'", "version": "==8.0.4" }, "coverage": { + "extras": [ + "toml" + ], "hashes": [ "sha256:01c5615d13f3dd3aa8543afc069e5319cfa0c7d712f6e04b920431e5c564a749", "sha256:106c16dfe494de3193ec55cac9640dd039b66e196e4641fa8ac396181578b982", @@ -1333,6 +1439,7 @@ "sha256:fdb6f7bd51c2d1714cea40718f6149ad9be6a2ee7d93b19e9f00934c0f2a74d9", "sha256:ffa9297c3a453fba4717d06df579af42ab9a28022444cae7fa605af4df612d54" ], + "markers": "python_version >= '3.7'", "version": "==6.4.1" }, "docutils": { @@ -1340,6 +1447,7 @@ "sha256:686577d2e4c32380bb50cbb22f575ed742d58168cee37e99117a854bcd88f125", "sha256:cf316c8370a737a022b72b56874f6602acf974a37a9fba42ec2876387549fc61" ], + "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'", "version": "==0.17.1" }, "idna": { @@ -1347,6 +1455,7 @@ "sha256:84d9dd047ffa80596e0f246e2eab0b391788b0503584e8945f2368256d2735ff", "sha256:9d643ff0a55b762d5cdb124b8eaa99c66322e2157b69160bc32796e824360e6d" ], + "markers": "python_version >= '3.5'", "version": "==3.3" }, "imagesize": { @@ -1354,6 +1463,7 @@ "sha256:1db2f82529e53c3e929e8926a1fa9235aa82d0bd0c580359c67ec31b2fddaa8c", "sha256:cd1750d452385ca327479d45b64d9c7729ecf0b3969a58148298c77092261f9d" ], + "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'", "version": "==1.3.0" }, "importlib-metadata": { @@ -1376,6 +1486,7 @@ "sha256:31351a702a408a9e7595a8fc6150fc3f43bb6bf7e319770cbc0db9df9437e852", "sha256:6088930bfe239f0e6710546ab9c19c9ef35e29792895fed6e6e31a023a182a61" ], + "markers": "python_version >= '3.7'", "version": "==3.1.2" }, "markupsafe": { @@ -1421,6 +1532,7 @@ "sha256:f121a1420d4e173a5d96e47e9a0c0dcff965afdf1626d28de1460815f7c4ee7a", "sha256:fc7b548b17d238737688817ab67deebb30e8073c95749d55538ed473130ec0c7" ], + "markers": "python_version >= '3.7'", "version": "==2.1.1" }, "mypy-extensions": { @@ -1435,6 +1547,7 @@ "sha256:dd47c42927d89ab911e606518907cc2d3a1f38bbd026385970643f9c5b8ecfeb", "sha256:ef103e05f519cdc783ae24ea4e2e0f508a9c99b2d4969652eed6a2e1ea5bd522" ], + "markers": "python_version >= '3.6'", "version": "==21.3" }, "pathspec": { @@ -1449,6 +1562,7 @@ "sha256:027d8e83a2d7de06bbac4e5ef7e023c02b863d7ea5d079477e722bb41ab25788", "sha256:58c8abb07dcb441e6ee4b11d8df0ac856038f944ab98b7be6b27b2a3c7feef19" ], + "markers": "python_version >= '3.7'", "version": "==2.5.2" }, "pluggy": { @@ -1456,6 +1570,7 @@ "sha256:4224373bacce55f955a878bf9cfa763c1e360858e330072059e10bad68531159", "sha256:74134bbf457f031a36d68416e1509f34bd5ccc019f0bcc952c7b909d06b37bd3" ], + "markers": "python_version >= '3.6'", "version": "==1.0.0" }, "py": { @@ -1463,6 +1578,7 @@ "sha256:51c75c4126074b472f746a24399ad32f6053d1b34b68d2fa41e558e6f4a98719", "sha256:607c53218732647dff4acdfcd50cb62615cedf612e72d1724fb1a0cc6405b378" ], + "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'", "version": "==1.11.0" }, "pygments": { @@ -1470,6 +1586,7 @@ "sha256:5eb116118f9612ff1ee89ac96437bb6b49e8f04d8a13b514ba26f620208e26eb", "sha256:dc9c10fb40944260f6ed4c688ece0cd2048414940f1cea51b8b226318411c519" ], + "markers": "python_version >= '3.6'", "version": "==2.12.0" }, "pyparsing": { @@ -1477,6 +1594,7 @@ "sha256:2b020ecf7d21b687f219b71ecad3631f644a47f01403fa1d1036b0c6416d70fb", "sha256:5026bae9a10eeaefb61dab2f09052b9f4307d44aee4eda64b309723d8d206bbc" ], + "markers": "python_full_version >= '3.6.8'", "version": "==3.0.9" }, "pytest": { @@ -1536,11 +1654,11 @@ }, "sphinx": { "hashes": [ - "sha256:36aa2a3c2f6d5230be94585bc5d74badd5f9ed8f3388b8eedc1726fe45b1ad30", - "sha256:f4da1187785a5bc7312cc271b0e867a93946c319d106363e102936a3d9857306" + "sha256:b18e978ea7565720f26019c702cd85c84376e948370f1cd43d60265010e1c7b0", + "sha256:d3e57663eed1d7c5c50895d191fdeda0b54ded6f44d5621b50709466c338d1e8" ], "index": "pypi", - "version": "==5.0.1" + "version": "==5.0.2" }, "sphinx-rtd-theme": { "hashes": [ @@ -1555,6 +1673,7 @@ "sha256:806111e5e962be97c29ec4c1e7fe277bfd19e9652fb1a4392105b43e01af885a", "sha256:a072735ec80e7675e3f432fcae8610ecf509c5f1869d17e2eecff44389cdbc58" ], + "markers": "python_version >= '3.5'", "version": "==1.0.2" }, "sphinxcontrib-devhelp": { @@ -1562,6 +1681,7 @@ "sha256:8165223f9a335cc1af7ffe1ed31d2871f325254c0423bc0c4c7cd1c1e4734a2e", "sha256:ff7f1afa7b9642e7060379360a67e9c41e8f3121f2ce9164266f61b9f4b338e4" ], + "markers": "python_version >= '3.5'", "version": "==1.0.2" }, "sphinxcontrib-htmlhelp": { @@ -1569,6 +1689,7 @@ "sha256:d412243dfb797ae3ec2b59eca0e52dac12e75a241bf0e4eb861e450d06c6ed07", "sha256:f5f8bb2d0d629f398bf47d0d69c07bc13b65f75a81ad9e2f71a63d4b7a2f6db2" ], + "markers": "python_version >= '3.6'", "version": "==2.0.0" }, "sphinxcontrib-jsmath": { @@ -1576,6 +1697,7 @@ "sha256:2ec2eaebfb78f3f2078e73666b1415417a116cc848b72e5172e596c871103178", "sha256:a9925e4a4587247ed2191a22df5f6970656cb8ca2bd6284309578f2153e0c4b8" ], + "markers": "python_version >= '3.5'", "version": "==1.0.1" }, "sphinxcontrib-qthelp": { @@ -1583,6 +1705,7 @@ "sha256:4c33767ee058b70dba89a6fc5c1892c0d57a54be67ddd3e7875a18d14cba5a72", "sha256:bd9fc24bcb748a8d51fd4ecaade681350aa63009a347a8c14e637895444dfab6" ], + "markers": "python_version >= '3.5'", "version": "==1.0.3" }, "sphinxcontrib-serializinghtml": { @@ -1590,6 +1713,7 @@ "sha256:352a9a00ae864471d3a7ead8d7d79f5fc0b57e8b3f95e9867eb9eb28999b92fd", "sha256:aa5f6de5dfdf809ef505c4895e51ef5c9eac17d0f287933eb49ec495280b6952" ], + "markers": "python_version >= '3.5'", "version": "==1.1.5" }, "tomli": { @@ -1597,6 +1721,7 @@ "sha256:939de3e7a6161af0c887ef91b7d41a53e7c5a1ca976325f429cb46ea9bc30ecc", "sha256:de526c12914f0c550d15924c62d72abc48d6fe7364aa87328337a31007fe8a4f" ], + "markers": "python_full_version < '3.11.0'", "version": "==2.0.1" }, "typing-extensions": { @@ -1604,6 +1729,7 @@ "sha256:6657594ee297170d19f67d55c05852a874e7eb634f4f753dbd667855e07c1708", "sha256:f1c24655a0da0d1b67f07e17a5e6b2a105894e6824b92096378bb3668ef02376" ], + "markers": "python_version >= '3.7'", "version": "==4.2.0" }, "urllib3": { @@ -1611,6 +1737,7 @@ "sha256:44ece4d53fb1706f667c9bd1c648f5469a2ec925fcf3a776667042d645472c14", "sha256:aabaf16477806a5e1dd19aa41f8c2b7950dd3c746362d7e3223dbe6de6ac448e" ], + "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' and python_version < '4.0'", "version": "==1.26.9" }, "zipp": { @@ -1618,6 +1745,7 @@ "sha256:56bf8aadb83c24db6c4b577e13de374ccfb67da2078beba1d037c17980bf43ad", "sha256:c4f6e5bbf48e74f7a38e7cc5b0480ff42b0ae5178957d564d18932525d5cf099" ], + "markers": "python_version >= '3.7'", "version": "==3.8.0" } } diff --git a/app.py b/app.py index 9d848f8..14f2ad6 100644 --- a/app.py +++ b/app.py @@ -40,11 +40,7 @@ def get_scraper_controller(): controller = ScraperController() controller.connect_to_db(engine) - scrapers = [VkontakteScraper(), - TelegramTelethonScraper(), - GettrScraper(), - BitchuteScraper(), - RumbleScraper()] + scrapers = [TelegramTelethonScraper(),] controller.register_scrapers(scrapers) @@ -73,6 +69,14 @@ def scrape_channels(args): controller = get_scraper_controller() controller.scrape_all_channels(archive_media=args.media) +def import_paths(args): + logger.info(f"Importing paths, media: {args.media}") + if len(args.paths) == 0: + logger.warning(f"No paths specified") + offset = args.offset or 0 + + controller = get_scraper_controller() + controller.import_all_paths(paths=args.paths, offset = offset, archive_media=args.media) def scrape_channel_info(args): logger.info(f"Scraping channel info") @@ -121,6 +125,14 @@ if __name__ == "__main__": "--media", action="store_true", help="[scrape-channels] Add this flag to media" ) + parser.add_argument( + "--paths", nargs = '+', help="[import-paths] Add this flag to specify paths of exported posts to be imported" + ) + + parser.add_argument( + "--offset", type = int, help="[import-paths] Add this flag to specify the file number in the specified paths to start importing" + ) + args = parser.parse_args() if args.command == "init-db": @@ -131,6 +143,9 @@ if __name__ == "__main__": elif args.command == "scrape-channels": logger.add("logs/scrape-channels.log", level="TRACE", rotation="100 MB") scrape_channels(args) + elif args.command == "import-paths": + logger.add("logs/import-paths.log", level="TRACE", rotation="100 MB") + import_paths(args) elif args.command == "archive-media": logger.add("logs/archive-media.log", level="TRACE", rotation="100 MB") archive_media(args) diff --git a/cisticola/base.py b/cisticola/base.py index 28ff9f8..fd7b897 100644 --- a/cisticola/base.py +++ b/cisticola/base.py @@ -11,7 +11,7 @@ import pytesseract import PIL import exiftool import re -from langdetect import detect, DetectorFactory +from langdetect import PROFILES_DIRECTORY, DetectorFactory from langdetect.lang_detect_exception import LangDetectException from loguru import logger import spacy @@ -165,6 +165,15 @@ nlp_ru = spacy.load('ru_core_news_sm', disable=['parser', 'tok2vec', 'attribute_ nlp_nl = spacy.load('nl_core_news_sm', disable=['parser', 'tok2vec', 'attribute_ruler']) nlp_xx = spacy.load('xx_ent_wiki_sm') +factory = DetectorFactory() +factory.load_profile(PROFILES_DIRECTORY) +detector = factory.create() + +def detect(text, detector=detector): + detector.text = "" + detector.append(text) + return detector.detect() + @dataclass class Post: """An object with fields for columns in the analysis table""" diff --git a/cisticola/scraper/base.py b/cisticola/scraper/base.py index 973fc55..88073ec 100644 --- a/cisticola/scraper/base.py +++ b/cisticola/scraper/base.py @@ -16,6 +16,7 @@ from sqlalchemy import nullsfirst from cisticola.base import Channel, RawChannelInfo, ScraperResult, mapper_registry from cisticola.utils import make_request +from cisticola.transformer.base import insert_or_select class Scraper: """Base class for defining platform-specific scrapers for scraping all posts @@ -429,6 +430,34 @@ class ScraperController: if not handled: logger.warning(f"No handler found for Channel {channel}") + session.close() + + def import_all_paths(self, paths: List[Path], offset: int = 0, archive_media: bool = False): + if self.session is None: + logger.error("No DB session") + return + + scraper = None + for _scraper in self.scrapers: + if 'Telegram' in _scraper.__version__: + scraper = _scraper + break + if scraper is None: + raise ValueError('Only Telegram scraper is currently able to import posts') + + session = self.session() + for path in paths: + files = sorted(os.listdir(path)) + for i, file in enumerate(files[offset :]): + logger.info(f"Starting import of posts from channel {file}, [{i + offset}/{len(files)}]") + posts = list(scraper.import_posts(os.path.join(path, file), session = session, insert = lambda obj: insert_or_select(obj, session, False), archive_media=archive_media)) + + session.bulk_save_objects(posts) + session.commit() + + logger.info( + f"{scraper} found {len(posts)} new posts from {file}") + session.close() @logger.catch(reraise = True) diff --git a/cisticola/scraper/telegram_telethon.py b/cisticola/scraper/telegram_telethon.py index 7235011..7212e91 100644 --- a/cisticola/scraper/telegram_telethon.py +++ b/cisticola/scraper/telegram_telethon.py @@ -5,7 +5,9 @@ import json import tempfile from pathlib import Path import time +import pickle +from sqlalchemy import func from loguru import logger from telethon.sync import TelegramClient from telethon.tl.functions.channels import GetFullChannelRequest @@ -163,6 +165,63 @@ class TelegramTelethonScraper(Scraper): archived_urls=archived_urls, media_archived=datetime.now(timezone.utc) if archive_media else None) + @logger.catch(reraise = True) + def import_posts(self, file: str, session, insert, archive_media: bool = True) -> Generator[ScraperResult, None, None]: + + with open(file, 'rb') as f: + posts = pickle.load(f) + screenname = file.split('/')[-1].split('.')[0] + logger.info(f"Loaded posts from channel {screenname}") + platform_ids = list(set([p.get('to_id', {}).get('channel_id') for p in posts if p['_'] == 'Message'])) or list(set([p.get('peer_id', {}).get('channel_id') for p in posts if p['_'] == 'Message'])) + if len(platform_ids) > 0: + platform_id = platform_ids[0] + else: + return [] + channel = session.query(Channel).filter_by(platform_id=str(platform_id), platform = 'Telegram').first() + if channel is None: + channel = Channel( + name=None, + platform_id=platform_id, + platform='Telegram', + url="https://t.me/s/" + screenname, + screenname=screenname, + category='imported', + source=self.__version__ + ) + channel = insert(channel) + else: + num_posts = session.query(func.count('*')).select_from(ScraperResult).filter(ScraperResult.channel==channel.id).scalar() + if num_posts != 0: + logger.info(f"Found {num_posts} already imported for channel {screenname}, skipping") + return [] + for post in posts: + post_url = f'{channel.url}/{post["id"]}' + + logger.trace(f"Archiving post {post_url} from {post['date']}") + + archived_urls = {} + + if post.get('media') is not None: + archived_urls[post_url] = None + + # if archive_media: + # blob, output_file_with_ext = self.archive_post_media(post, client) + # if blob is not None: + # # TODO specify Content-Type + # archived_url = self.archive_blob(blob = blob, content_type = '', key = output_file_with_ext) + # archived_urls[post_url] = archived_url + + yield ScraperResult( + scraper=self.__version__, + platform="Telegram", + channel=channel.id, + platform_id=post_url, + date=post['date'].replace(tzinfo=timezone.utc), + date_archived=datetime.now(timezone.utc), + raw_data=json.dumps(post, default=str), + archived_urls=archived_urls, + media_archived=datetime.now(timezone.utc) if archive_media else None) + @logger.catch def get_profile(self, channel: Channel) -> RawChannelInfo: username = TelegramTelethonScraper.get_channel_identifier(channel) diff --git a/cisticola/transformer/base.py b/cisticola/transformer/base.py index 32ed37c..97313da 100644 --- a/cisticola/transformer/base.py +++ b/cisticola/transformer/base.py @@ -5,6 +5,8 @@ from sqlalchemy.engine.base import Engine from sqlalchemy.sql.expression import func from collections import defaultdict from datetime import datetime +#DEBUG +import time from cisticola.base import RawChannelInfo, ChannelInfo, ScraperResult, Post, Media, Channel, mapper_registry @@ -88,57 +90,6 @@ class ETLController: self.session = sessionmaker() self.session.configure(bind=engine) - def insert_or_select(self, obj, session, hydrate: bool = True): - """Inserts an object into the database or returns an existing object from the database. - Regardless, the resulting object has an `id` attribute that can be referenced later.""" - - instance = None - - # This is using some adhoc unique constraints that might be worth formalizing at some point - if type(obj) == Channel: - instance = session.query(Channel).filter_by(url=obj.url, platform_id=str(obj.platform_id or '') or obj.platform_id, platform=obj.platform).first() - - elif type(obj) == Post: - instance = None - # instance = session.query(Post).filter_by(platform=obj.platform, platform_id=obj.platform_id).first() - - elif issubclass(type(obj), Media): - instance = session.query(type(obj)).filter_by(original_url=obj.original_url, post=obj.post).first() - if instance: - logger.info(f"Found matching DB entry for {obj}: {instance}") - return instance - - instance = session.query(type(obj)).filter_by(original_url=obj.original_url).first() - - # For Media objects we want to duplicate the entry to preserve the relationship with the post. - # However, we also want to avoid rehydration, hence the code below: - if instance: - logger.info(f"Found matching media record, duplicating and inserting for new post") - - session.expunge(instance) - make_transient(instance) - instance.id = None - instance.post = obj.post - instance.raw_id = obj.raw_id - - session.add(instance) - session.flush() - return instance - - if instance: - logger.info(f"Found matching DB entry for {obj}: {instance}") - return instance - - if hydrate: - obj.hydrate() - - session.add(obj) - session.flush() - - logger.trace(f"Inserted new object {obj}") - - return obj - @logger.catch(reraise=True) def transform_results(self, results: List[ScraperResult], hydrate: bool = True): """Transforms raw ScraperResults objects into Post objects and @@ -157,6 +108,10 @@ class ETLController: session = self.session() + transformed_results = [] + # DEBUG + start_time = time.time() + for result in results: if result.scraper is not None and result.platform is not None: for transformer in self.transformers: @@ -166,13 +121,22 @@ class ETLController: logger.trace(f"{transformer} is handling result {result.id} ({result.date})") handled = True - transformer.transform(result, lambda obj: self.insert_or_select(obj, session, hydrate), session) - - session.commit() + transformed_results.append(transformer.transform(result, lambda obj: insert_or_select(obj, session, hydrate), session)) + + # count += 1 break if handled == False: logger.warning(f"No Transformer could handle ID {result.id} with platform {result.platform} ({result.date})") + # # DEBUG + # if count == 10000: + # import sys; sys.exit() + + to_transform = list(filter(None, transformed_results)) + session.bulk_save_objects(to_transform) + session.commit() + total_time = time.time() - start_time + logger.info(f'Transformed {len(to_transform)} posts out of {len(results)} ScraperResults in {total_time:.1f} seconds') @logger.catch(reraise=True) def transform_all_untransformed(self, hydrate: bool = True): @@ -192,22 +156,31 @@ class ETLController: session = self.session() BATCH_SIZE = 50000 - offset = 0 - batch = [] + batch = (session.query(ScraperResult) + .join(Post, isouter=True) + .where(Post.raw_id == None) + .order_by(ScraperResult.date.asc()) + .limit(BATCH_SIZE) + ).all() - query = (session.query(ScraperResult) - .join(Post, isouter=True) - .where(Post.raw_id == None) - .order_by(ScraperResult.date.asc()) - ) + while len(batch) > 0: + # logger.info(f"Fetching untransformed posts batch of {BATCH_SIZE}, offset {offset}") - while len(batch) > 0 or offset == 0: - logger.info(f"Fetching untransformed posts batch of {BATCH_SIZE}, offset {offset}") + # DEBUG + # import pdb; pdb.set_trace() - batch = query.slice(offset, offset + BATCH_SIZE).all() - offset += BATCH_SIZE + start_time = time.time() + batch = (session.query(ScraperResult) + .join(Post, isouter=True) + .where(Post.raw_id == None) + .where(ScraperResult.date > max(batch, key=lambda v: v.date).date) + .order_by(ScraperResult.date.asc()) + .limit(BATCH_SIZE) + ).all() + total_time = time.time() - start_time + logger.info(f'Retrieved {BATCH_SIZE} ScraperResults in {total_time:.1f} seconds') - logger.info(f"Found {len(batch)} items to ETL ({offset} already processed)") + # logger.info(f"Found {len(batch)} items to ETL ({offset} already processed)") self.transform_results(batch, hydrate=hydrate) @@ -259,8 +232,58 @@ class ETLController: logger.info(f"Fetching untransformed info batch of {BATCH_SIZE}, offset {offset}") batch = query.slice(offset, offset + BATCH_SIZE).all() + logger.info(f"Found {len(batch)} info items to ETL ({offset} already processed)") + + self.transform_info(batch) offset += BATCH_SIZE - logger.info(f"Found {len(batch)} info items to ETL ({offset} already processed)") +def insert_or_select(obj, session, hydrate: bool = True): + """Inserts an object into the database or returns an existing object from the database. + Regardless, the resulting object has an `id` attribute that can be referenced later.""" - self.transform_info(batch) + instance = None + + # This is using some adhoc unique constraints that might be worth formalizing at some point + if type(obj) == Channel: + instance = session.query(Channel).filter_by(url=obj.url, platform_id=str(obj.platform_id or '') or obj.platform_id, platform=obj.platform).first() + + elif type(obj) == Post: + instance = None + # instance = session.query(Post).filter_by(platform=obj.platform, platform_id=obj.platform_id).first() + + elif issubclass(type(obj), Media): + instance = session.query(type(obj)).filter_by(original_url=obj.original_url, post=obj.post).first() + if instance: + logger.info(f"Found matching DB entry for {obj}: {instance}") + return instance + + instance = session.query(type(obj)).filter_by(original_url=obj.original_url).first() + + # For Media objects we want to duplicate the entry to preserve the relationship with the post. + # However, we also want to avoid rehydration, hence the code below: + if instance: + logger.info(f"Found matching media record, duplicating and inserting for new post") + + session.expunge(instance) + make_transient(instance) + instance.id = None + instance.post = obj.post + instance.raw_id = obj.raw_id + + session.add(instance) + session.flush() + return instance + + if instance: + logger.info(f"Found matching DB entry for {obj}: {instance}") + return instance + + if hydrate: + obj.hydrate() + + session.add(obj) + session.flush() + + logger.trace(f"Inserted new object {obj}") + + return obj \ No newline at end of file diff --git a/cisticola/transformer/telegram_telethon.py b/cisticola/transformer/telegram_telethon.py index b5326e2..52c3a47 100644 --- a/cisticola/transformer/telegram_telethon.py +++ b/cisticola/transformer/telegram_telethon.py @@ -24,6 +24,17 @@ class TelegramTelethonTransformer(Transformer): bad_channels = {} + def __init__(self): + super().__init__() + + api_id = os.environ['TELEGRAM_API_ID'] + api_hash = os.environ['TELEGRAM_API_HASH'] + phone = os.environ['TELEGRAM_PHONE'] + + # set up a persistent client for Telethon + self.client = TelegramClient(phone, api_id, api_hash) + self.client.connect() + def can_handle(self, data: ScraperResult) -> bool: scraper = data.scraper.split(' ') if scraper[0] == "TelegramTelethonScraper": @@ -32,16 +43,13 @@ class TelegramTelethonTransformer(Transformer): return False def get_screenname_from_id(self, channel_id): - api_id = os.environ['TELEGRAM_API_ID'] - api_hash = os.environ['TELEGRAM_API_HASH'] try: - with TelegramClient("transform.session", api_id, api_hash) as client: - data = client.get_entity(channel_id) - if isinstance(data, types.User): - return (data.username, str(data.first_name or "") + " " + str(data.last_name or ""), "") - else: - return (data.username, data.title, "") + data = self.client.get_entity(channel_id) + if isinstance(data, types.User): + return (data.username, str(data.first_name or "") + " " + str(data.last_name or ""), "") + else: + return (data.username, data.title, "") except ChannelPrivateError: logger.info("ChannelPrivateError") return ("", "", "ChannelPrivateError") @@ -125,75 +133,94 @@ class TelegramTelethonTransformer(Transformer): raw = json.loads(data.raw_data) if raw['_'] != 'Message': - logger.warning(f"Cannot convert type {raw['_']} to post") + # DEBUG + # logger.warning(f"Cannot convert type {raw['_']} to post") return fwd_from = None - if raw['fwd_from'] and raw['fwd_from']['from_id'] and 'channel_id' in raw['fwd_from']['from_id']: - channel = session.query(Channel).filter_by(platform_id=str(raw['fwd_from']['from_id']['channel_id']), platform = 'Telegram').first() + # channel = True + + # if raw.get('fwd_from') is not None: + # if raw['fwd_from'].get('from_id'): + # if isinstance(raw['fwd_from']['from_id'], int): + # channel_id = str(raw['fwd_from']['from_id']) + # else: + # if 'channel_id' in raw['fwd_from']['from_id']: + # channel_id = str(raw['fwd_from']['from_id']['channel_id']) + # elif 'user_id' in raw['fwd_from']['from_id']: + # channel_id = str(raw['fwd_from']['from_id']['user_id']) + # elif raw['fwd_from'].get('channel_id'): + # channel_id = str(raw['fwd_from']['channel_id']) + # elif raw['fwd_from'].get('user_id'): + # channel_id = str(raw['fwd_from']['user_id']) + # channel = session.query(Channel).filter_by(platform_id=channel_id, platform = 'Telegram').first() - if channel is None: - (screenname, name, notes) = self.get_screenname_from_id(raw['fwd_from']['from_id']['channel_id']) + # if channel is None: + # (screenname, name, notes) = self.get_screenname_from_id(channel_id) - if name == "": - logger.info("Trying fallback web interface") - orig_channel = session.query(Channel).filter_by(id=data.channel).first() - if orig_channel.screenname is not None: - name = self.get_name_from_web_interface(orig_channel.screenname, raw['id']) + # # if name == "": + # # logger.info("Trying fallback web interface") + # # orig_channel = session.query(Channel).filter_by(id=data.channel).first() + # # if orig_channel.screenname is not None: + # # name = self.get_name_from_web_interface(orig_channel.screenname, raw['id']) - channel = Channel( - name=name, - platform_id=raw['fwd_from']['from_id']['channel_id'], - platform=data.platform, - url="https://t.me/s/" + screenname if screenname is not None else "", - screenname=screenname, - category='forwarded', - source=self.__version__, - notes=notes - ) + # channel = Channel( + # name=name, + # platform_id=channel_id, + # platform=data.platform, + # url="https://t.me/s/" + screenname if screenname not in (None, "") else "", + # screenname=screenname, + # category='forwarded', + # source=self.__version__, + # notes=notes + # ) - channel = insert(channel) - logger.info(f"Added {channel}") + # channel = insert(channel) + # logger.info(f"Added {channel}") - fwd_from = channel.id + # fwd_from = channel.id reply_to = None - if raw['reply_to']: - reply_to_id = str(raw['reply_to']['reply_to_msg_id']) - post = session.query(Post).filter_by(channel=data.channel, platform_id=reply_to_id).first() - if post is None: - reply_to = -1 - else: - reply_to = post.id + # reply_to_id = None + # if raw.get('reply_to'): + # reply_to_id = str(raw['reply_to']['reply_to_msg_id']) + # elif raw.get('reply_to_msg_id'): + # reply_to_id = str(raw['reply_to_msg_id']) + # if reply_to_id: + # post = session.query(Post).filter_by(channel=data.channel, platform_id=reply_to_id).first() + # if post is None: + # reply_to = -1 + # else: + # reply_to = post.id mentions = [] - for mention_entity in [entity for entity in raw['entities'] if entity['_'] == 'MessageEntityMention']: + # for mention_entity in [entity for entity in raw['entities'] if entity['_'] == 'MessageEntityMention']: - offset = mention_entity['offset'] - length = mention_entity['length'] + # offset = mention_entity['offset'] + # length = mention_entity['length'] - screenname = add_surrogate(raw['message'])[offset:offset+length].strip('@').strip() + # screenname = add_surrogate(raw['message'])[offset:offset+length].strip('@').strip() - channel = session.query(Channel).filter(func.lower(Channel.screenname)==func.lower(screenname)).first() + # channel = session.query(Channel).filter(func.lower(Channel.screenname)==func.lower(screenname)).first() - if channel is None: + # if channel is None: - channel = Channel( - name = None, - platform_id = None, - platform = 'Telegram', - url="https://t.me/s/" + screenname, - screenname=screenname, - category='mentioned', - source=self.__version__, - ) + # channel = Channel( + # name = None, + # platform_id = None, + # platform = 'Telegram', + # url="https://t.me/s/" + screenname, + # screenname=screenname, + # category='mentioned', + # source=self.__version__, + # ) - channel = insert(channel) - logger.info(f"Added {channel}") + # channel = insert(channel) + # logger.info(f"Added {channel}") - mentions.append(channel.id) + # mentions.append(channel.id) channel = session.query(Channel).filter_by(id=int(data.channel)).first() @@ -225,7 +252,8 @@ class TelegramTelethonTransformer(Transformer): views = raw.get('views') ) - transformed = insert(transformed) + # transformed = insert(transformed) + return transformed # for k in data.archived_urls: # if data.archived_urls[k]: