mirror of
https://github.com/bellingcat/cisticola.git
synced 2026-06-08 03:18:34 +03:00
Merge pull request #20 from bellingcat/separate-media-archiving
WIP: Separate media archiving and CLI
This commit is contained in:
3
.gitignore
vendored
3
.gitignore
vendored
@@ -11,10 +11,11 @@ docs/source/_*
|
||||
*.session
|
||||
service_account.json
|
||||
.vscode/
|
||||
*.log
|
||||
|
||||
# Unit test / coverage reports
|
||||
reports
|
||||
.coverage
|
||||
.coverage*
|
||||
.cache
|
||||
.pytest_cache/
|
||||
cover/
|
||||
|
||||
5
Pipfile
5
Pipfile
@@ -22,6 +22,9 @@ instaloader = "*"
|
||||
gspread = "*"
|
||||
cryptg = "*"
|
||||
gabber = {git = "https://github.com/stanfordio/gabber.git"}
|
||||
psycopg2-binary = "*"
|
||||
tqdm = "*"
|
||||
ratelimit = "*"
|
||||
|
||||
[dev-packages]
|
||||
pytest = "*"
|
||||
@@ -33,7 +36,7 @@ sphinx = "*"
|
||||
sphinx_rtd_theme = "*"
|
||||
|
||||
[requires]
|
||||
python_version = "3.9"
|
||||
python_version = "3.8"
|
||||
|
||||
[pipenv]
|
||||
allow_prereleases = true
|
||||
|
||||
246
Pipfile.lock
generated
246
Pipfile.lock
generated
@@ -1,11 +1,11 @@
|
||||
{
|
||||
"_meta": {
|
||||
"hash": {
|
||||
"sha256": "b712e767d64e54e83e8c2d8a27a68203583ed7ad31d4ea3b4b6076a72a2150fd"
|
||||
"sha256": "e57f79178ac0e05f9753a29f97e08d2ae96b7775044bb4c6ba616baae1d21183"
|
||||
},
|
||||
"pipfile-spec": 6,
|
||||
"requires": {
|
||||
"python_version": "3.9"
|
||||
"python_version": "3.8"
|
||||
},
|
||||
"sources": [
|
||||
{
|
||||
@@ -16,6 +16,28 @@
|
||||
]
|
||||
},
|
||||
"default": {
|
||||
"backports.zoneinfo": {
|
||||
"hashes": [
|
||||
"sha256:17746bd546106fa389c51dbea67c8b7c8f0d14b5526a579ca6ccf5ed72c526cf",
|
||||
"sha256:1b13e654a55cd45672cb54ed12148cd33628f672548f373963b0bff67b217328",
|
||||
"sha256:1c5742112073a563c81f786e77514969acb58649bcdf6cdf0b4ed31a348d4546",
|
||||
"sha256:4a0f800587060bf8880f954dbef70de6c11bbe59c673c3d818921f042f9954a6",
|
||||
"sha256:5c144945a7752ca544b4b78c8c41544cdfaf9786f25fe5ffb10e838e19a27570",
|
||||
"sha256:7b0a64cda4145548fed9efc10322770f929b944ce5cee6c0dfe0c87bf4c0c8c9",
|
||||
"sha256:8439c030a11780786a2002261569bdf362264f605dfa4d65090b64b05c9f79a7",
|
||||
"sha256:8961c0f32cd0336fb8e8ead11a1f8cd99ec07145ec2931122faaac1c8f7fd987",
|
||||
"sha256:89a48c0d158a3cc3f654da4c2de1ceba85263fafb861b98b59040a5086259722",
|
||||
"sha256:a76b38c52400b762e48131494ba26be363491ac4f9a04c1b7e92483d169f6582",
|
||||
"sha256:da6013fd84a690242c310d77ddb8441a559e9cb3d3d59ebac9aca1a57b2e18bc",
|
||||
"sha256:e55b384612d93be96506932a786bbcde5a2db7a9e6a4bb4bffe8b733f5b9036b",
|
||||
"sha256:e81b76cace8eda1fca50e345242ba977f9be6ae3945af8d46326d776b4cf78d1",
|
||||
"sha256:e8236383a20872c0cdf5a62b554b27538db7fa1bbec52429d8d106effbaeca08",
|
||||
"sha256:f04e857b59d9d1ccc39ce2da1021d196e47234873820cbeaad210724b1ee28ac",
|
||||
"sha256:fadbfe37f74051d024037f223b8e001611eac868b5c5b06144ef4d8b799862f2"
|
||||
],
|
||||
"markers": "python_version >= '3.6' and python_version < '3.9'",
|
||||
"version": "==0.2.1"
|
||||
},
|
||||
"beautifulsoup4": {
|
||||
"hashes": [
|
||||
"sha256:9a315ce70049920ea4572a4055bc4bd700c940521d36fc858205ad4fcde149bf",
|
||||
@@ -26,19 +48,19 @@
|
||||
},
|
||||
"boto3": {
|
||||
"hashes": [
|
||||
"sha256:127ebdf58c8825b53f1eff111e08c49ffffeb1f6d7a5665c9907ce8128fe14b1",
|
||||
"sha256:b7ce3bf013f0f60e40c2676d5a7b620ed927cfad0aa348a606b10e9a0387f249"
|
||||
"sha256:ef210f8e85cdb6d26a38ebad1cfe9cefdef2ab269207e5987653555375a7ef6b",
|
||||
"sha256:f0af8f4ef5fe6353c794cd3cce627d469a618b58ace7ca75a63cfd719df615ce"
|
||||
],
|
||||
"index": "pypi",
|
||||
"version": "==1.21.29"
|
||||
"version": "==1.21.30"
|
||||
},
|
||||
"botocore": {
|
||||
"hashes": [
|
||||
"sha256:b467d64cd773dc4d49ef31b18a8dded554f284f799720bd12e989fe2138fd5b8",
|
||||
"sha256:de87907d42682179946ddfa113b9334e3c4258404aef19edd8c92381ff54775c"
|
||||
"sha256:af4bdc51eeecbe9fdcdadbed9ad58c5c91380ef30f3560022bbc2ee1d78f0ad6",
|
||||
"sha256:c622751093e3d0bf61343e66d6d06190ef30bf42b1557d5070ca84e9efa06d4b"
|
||||
],
|
||||
"markers": "python_version >= '3.6'",
|
||||
"version": "==1.24.29"
|
||||
"version": "==1.24.30"
|
||||
},
|
||||
"brotli": {
|
||||
"hashes": [
|
||||
@@ -195,11 +217,11 @@
|
||||
},
|
||||
"click": {
|
||||
"hashes": [
|
||||
"sha256:19a4baa64da924c5e0cd889aba8e947f280309f1a2ce0947a3e3a7bcb7cc72d6",
|
||||
"sha256:977c213473c7665d3aa092b41ff12063227751c41d7b17165013e10069cc5cd2"
|
||||
"sha256:5e0d195c2067da3136efb897449ec1e9e6c98282fbf30d7f9e164af9be901a6b",
|
||||
"sha256:7ab900e38149c9872376e8f9b5986ddcaf68c0f413cf73678a0bca5547e6f976"
|
||||
],
|
||||
"markers": "python_version >= '3.7'",
|
||||
"version": "==8.1.0"
|
||||
"version": "==8.1.1"
|
||||
},
|
||||
"cryptg": {
|
||||
"hashes": [
|
||||
@@ -302,63 +324,64 @@
|
||||
},
|
||||
"greenlet": {
|
||||
"hashes": [
|
||||
"sha256:004aed447382d80a56ecc354a6d807f305e6c808714ce6ccbca4839c94fae81d",
|
||||
"sha256:068d68fad6bd623e29a2d36e74538c9b9d6dc6464931cd27d93da6cfc6a7f242",
|
||||
"sha256:06fd4075754009c9817c6b4e1dc0af4616de52757b6ca973a81c3c1aadc28257",
|
||||
"sha256:1004cb542451814b12a4f38e835a47734e2b2c683acbf463d5ae76282a3974cf",
|
||||
"sha256:10c358633a8b27bfc32d27114ef2ca2ddc9f1f89f1643d1157b85e1fdd695315",
|
||||
"sha256:115bc25fefbdc692c4483e9ddb9011ccd0251590ed59dbfff0f4eb7050bf99c4",
|
||||
"sha256:1d987a2579336792f73ae6b106c2f087e32afc8573fbf9566f123ac6d8cfb72f",
|
||||
"sha256:2128d727fd1e8afba8e68feb2cdcf88c90163b69ddc9707722a3e491c5280720",
|
||||
"sha256:230132c241fe284f93f2e7b3969e9b22bbd76ef98cf93e382c945d378907f5a4",
|
||||
"sha256:23558f7bd08a663386c032ab8d302d613d2d02ae0c9758ad410bab6035b58d3d",
|
||||
"sha256:255d520d3e4a5f16883b182e1a94219fe455ab4f50aaaf534bfd6d64ee728397",
|
||||
"sha256:2a6bc19a728f6f643cfc89b876159a1a25a8f7d8700c013d48a73691f80b4550",
|
||||
"sha256:379bed346ef8ba0a0e698b3c5975a44d15dd4a5bbff40bbd7fd548b445d5550b",
|
||||
"sha256:3b12d0866759db93b0a893b4e50a7d7d1681519d2346c26695bb8bb2c652230e",
|
||||
"sha256:40d491944f69e350e1e8b25f6ca49459824ede1678ec0cd4b5541f41edc06614",
|
||||
"sha256:471484c7b9d7b7867263051aa81cdeed6e06b455e629a7f05eb91a6cb8bd0836",
|
||||
"sha256:488c557080557bc01aabb3e1bda7225c68455b853733a8652857ac0d810dad1b",
|
||||
"sha256:49c2e76e7aa81ba889b3c183e2341af3cc6161ee38852085110ae49d5b5d9a40",
|
||||
"sha256:52d13ec90236e5935ed6da044e78faa1371d5116cc43fe6d7ca8994dd619ef96",
|
||||
"sha256:57898c69a253d81f487787bdd538629fabd671fab8a9e31b041ca30965fd9556",
|
||||
"sha256:5d577eef5beb5730ef01ab39983eb852a97c359b7a546809adf70c409f4b2ecc",
|
||||
"sha256:6a41987c1474c9158a0c0c96611530a8f299bc547d35bee8add981b8b2534f74",
|
||||
"sha256:6ae67b7df8db3626af8e042e9c6949cfa27d1a3bbbfdff29e45b72bb6673a650",
|
||||
"sha256:6c42c27e9d12e8a481aff469ffe8dd4ce0484c354a418470960f760f6ae41e7c",
|
||||
"sha256:6c4a90c9f6128b4d0905a89930bd325e0491574e5cb453f606bb7094a3197587",
|
||||
"sha256:6e64518e5833ac2d9359b6d9bd4df2c0cf441a0f3a4eca9e735fbea99009fa70",
|
||||
"sha256:6fd3a270c23c5b42d86a9c7c6b0229f23ee4a7a4cabdaaa1693ad7a0982d13cb",
|
||||
"sha256:70db73351e0fcf11a76288c47a0469d9a330bcb2e7618c5eb57432b8caa82403",
|
||||
"sha256:771f401692046845626cbdf1dd0f04e999413ede0ee9ad39033fe30b5fa2e845",
|
||||
"sha256:7935026ec61b967cbc6b746c0ca75c1651ea118d7fee4d259cff9e6866153374",
|
||||
"sha256:7b76b1cac9baac1980210e29145800954e7b42e91ef69c4d695de1cab87ce41f",
|
||||
"sha256:7e3f37c11b6699b1a1e0fcc0e88829dba4f2866546381b05ab8b3f4db645a823",
|
||||
"sha256:8370fa65ad421484894f559055f951843754153b72b9bca2ebdc5288efe2e3f0",
|
||||
"sha256:8ae9c443d44a4e23252632e4d7775f419f992d0df3eff923e23775f5cc551d39",
|
||||
"sha256:8b31d85f2781e44f1ffaaf7ea07f484e7d42317c677c355fa77b4a1a4bea7394",
|
||||
"sha256:8b450336b27f3b375cadc474c6704838eaa8dd3ca312aac3bb69d92264a8e638",
|
||||
"sha256:9ce84357388a76d886febff4e50e321c212ffd3248b590960b2da6e02404a5c9",
|
||||
"sha256:a23e986fb0ba8e7407286add41fa0d4207be44e3dce1b04789f4757800eca1cf",
|
||||
"sha256:a81610ee00d0da9cd2c8679479b7791149365b6dfb3971b01b22ee29b04787ce",
|
||||
"sha256:b4e40444975e5ab0ed3004369209c39a28e084951daaeee4919f164b6b849b14",
|
||||
"sha256:b66600de16702b9dfa74bea34524b55183a2183e5fd92f20fe6c2fcae550a64c",
|
||||
"sha256:ba6ee18694d3673796b7a31b7d21254e87e9e43ca5be56f323fd396111255315",
|
||||
"sha256:bd03837da28293baa39bdfc3cada69e2f8807f423ae06168aa28d2b32c63a6b6",
|
||||
"sha256:bd2192070f88c0778ae1d68a0980fdece3473498c1db37f3794e3454f91e3ecf",
|
||||
"sha256:c1f6f1a3cc013012cd1da913c40b13e6d721046a8c8a0ea0cde94069645a75db",
|
||||
"sha256:ce10a8e7e067bde3c1fbf494d2b8859db510206030b0b67bc3af90b0eb1887b9",
|
||||
"sha256:d31386d208303a5a6cf0819ef9f6db6680bab9e4ca8e48adb3d4b26ead89beb7",
|
||||
"sha256:d83b3af53b201970973c5574b39df226746194063bb248a53fd12b470ac34319",
|
||||
"sha256:df9657b212c054ac6d803290d7c4bcd7790af0b725984fce1eeb0a1e3f2d9798",
|
||||
"sha256:e576e5fd3f129e6b3595dc734ac7f2b8c548f19ef07781194bc538dc9c0cdbbc",
|
||||
"sha256:e7400358558094c1bcedc75f3b3c4f400c53130b44833848890a99968dee6a64",
|
||||
"sha256:eb6a385f8577d30e4cb43dd555fb134ddaae1edeb84205e09dabec332bf49fd0",
|
||||
"sha256:f27f0875e0873f6bf5df09a456bfcac0667824cabac4cad30b43f36e0382ffe7",
|
||||
"sha256:fcd4a6d04995f1d66bc78b503e4e59ae72fd32aaec4f661657fe5ae5c1aa4ce3"
|
||||
"sha256:0051c6f1f27cb756ffc0ffbac7d2cd48cb0362ac1736871399a739b2885134d3",
|
||||
"sha256:00e44c8afdbe5467e4f7b5851be223be68adb4272f44696ee71fe46b7036a711",
|
||||
"sha256:013d61294b6cd8fe3242932c1c5e36e5d1db2c8afb58606c5a67efce62c1f5fd",
|
||||
"sha256:049fe7579230e44daef03a259faa24511d10ebfa44f69411d99e6a184fe68073",
|
||||
"sha256:14d4f3cd4e8b524ae9b8aa567858beed70c392fdec26dbdb0a8a418392e71708",
|
||||
"sha256:166eac03e48784a6a6e0e5f041cfebb1ab400b394db188c48b3a84737f505b67",
|
||||
"sha256:17ff94e7a83aa8671a25bf5b59326ec26da379ace2ebc4411d690d80a7fbcf23",
|
||||
"sha256:1e12bdc622676ce47ae9abbf455c189e442afdde8818d9da983085df6312e7a1",
|
||||
"sha256:21915eb821a6b3d9d8eefdaf57d6c345b970ad722f856cd71739493ce003ad08",
|
||||
"sha256:288c6a76705dc54fba69fbcb59904ae4ad768b4c768839b8ca5fdadec6dd8cfd",
|
||||
"sha256:2bde6792f313f4e918caabc46532aa64aa27a0db05d75b20edfc5c6f46479de2",
|
||||
"sha256:32ca72bbc673adbcfecb935bb3fb1b74e663d10a4b241aaa2f5a75fe1d1f90aa",
|
||||
"sha256:356b3576ad078c89a6107caa9c50cc14e98e3a6c4874a37c3e0273e4baf33de8",
|
||||
"sha256:40b951f601af999a8bf2ce8c71e8aaa4e8c6f78ff8afae7b808aae2dc50d4c40",
|
||||
"sha256:572e1787d1460da79590bf44304abbc0a2da944ea64ec549188fa84d89bba7ab",
|
||||
"sha256:58df5c2a0e293bf665a51f8a100d3e9956febfbf1d9aaf8c0677cf70218910c6",
|
||||
"sha256:64e6175c2e53195278d7388c454e0b30997573f3f4bd63697f88d855f7a6a1fc",
|
||||
"sha256:7227b47e73dedaa513cdebb98469705ef0d66eb5a1250144468e9c3097d6b59b",
|
||||
"sha256:7418b6bfc7fe3331541b84bb2141c9baf1ec7132a7ecd9f375912eca810e714e",
|
||||
"sha256:7cbd7574ce8e138bda9df4efc6bf2ab8572c9aff640d8ecfece1b006b68da963",
|
||||
"sha256:7ff61ff178250f9bb3cd89752df0f1dd0e27316a8bd1465351652b1b4a4cdfd3",
|
||||
"sha256:833e1551925ed51e6b44c800e71e77dacd7e49181fdc9ac9a0bf3714d515785d",
|
||||
"sha256:8639cadfda96737427330a094476d4c7a56ac03de7265622fcf4cfe57c8ae18d",
|
||||
"sha256:8c5d5b35f789a030ebb95bff352f1d27a93d81069f2adb3182d99882e095cefe",
|
||||
"sha256:8c790abda465726cfb8bb08bd4ca9a5d0a7bd77c7ac1ca1b839ad823b948ea28",
|
||||
"sha256:8d2f1fb53a421b410751887eb4ff21386d119ef9cde3797bf5e7ed49fb51a3b3",
|
||||
"sha256:903bbd302a2378f984aef528f76d4c9b1748f318fe1294961c072bdc7f2ffa3e",
|
||||
"sha256:93f81b134a165cc17123626ab8da2e30c0455441d4ab5576eed73a64c025b25c",
|
||||
"sha256:95e69877983ea39b7303570fa6760f81a3eec23d0e3ab2021b7144b94d06202d",
|
||||
"sha256:9633b3034d3d901f0a46b7939f8c4d64427dfba6bbc5a36b1a67364cf148a1b0",
|
||||
"sha256:97e5306482182170ade15c4b0d8386ded995a07d7cc2ca8f27958d34d6736497",
|
||||
"sha256:9f3cba480d3deb69f6ee2c1825060177a22c7826431458c697df88e6aeb3caee",
|
||||
"sha256:aa5b467f15e78b82257319aebc78dd2915e4c1436c3c0d1ad6f53e47ba6e2713",
|
||||
"sha256:abb7a75ed8b968f3061327c433a0fbd17b729947b400747c334a9c29a9af6c58",
|
||||
"sha256:aec52725173bd3a7b56fe91bc56eccb26fbdff1386ef123abb63c84c5b43b63a",
|
||||
"sha256:b11548073a2213d950c3f671aa88e6f83cda6e2fb97a8b6317b1b5b33d850e06",
|
||||
"sha256:b1692f7d6bc45e3200844be0dba153612103db241691088626a33ff1f24a0d88",
|
||||
"sha256:b336501a05e13b616ef81ce329c0e09ac5ed8c732d9ba7e3e983fcc1a9e86965",
|
||||
"sha256:b8c008de9d0daba7b6666aa5bbfdc23dcd78cafc33997c9b7741ff6353bafb7f",
|
||||
"sha256:b92e29e58bef6d9cfd340c72b04d74c4b4e9f70c9fa7c78b674d1fec18896dc4",
|
||||
"sha256:be5f425ff1f5f4b3c1e33ad64ab994eed12fc284a6ea71c5243fd564502ecbe5",
|
||||
"sha256:dd0b1e9e891f69e7675ba5c92e28b90eaa045f6ab134ffe70b52e948aa175b3c",
|
||||
"sha256:e30f5ea4ae2346e62cedde8794a56858a67b878dd79f7df76a0767e356b1744a",
|
||||
"sha256:e6a36bb9474218c7a5b27ae476035497a6990e21d04c279884eb10d9b290f1b1",
|
||||
"sha256:e859fcb4cbe93504ea18008d1df98dee4f7766db66c435e4882ab35cf70cac43",
|
||||
"sha256:eb6ea6da4c787111adf40f697b4e58732ee0942b5d3bd8f435277643329ba627",
|
||||
"sha256:ec8c433b3ab0419100bd45b47c9c8551248a5aee30ca5e9d399a0b57ac04651b",
|
||||
"sha256:eff9d20417ff9dcb0d25e2defc2574d10b491bf2e693b4e491914738b7908168",
|
||||
"sha256:f0214eb2a23b85528310dad848ad2ac58e735612929c8072f6093f3585fd342d",
|
||||
"sha256:f276df9830dba7a333544bd41070e8175762a7ac20350786b322b714b0e654f5",
|
||||
"sha256:f3acda1924472472ddd60c29e5b9db0cec629fbe3c5c5accb74d6d6d14773478",
|
||||
"sha256:f70a9e237bb792c7cc7e44c531fd48f5897961701cdaa06cf22fc14965c496cf",
|
||||
"sha256:f9d29ca8a77117315101425ec7ec2a47a22ccf59f5593378fc4077ac5b754fce",
|
||||
"sha256:fa877ca7f6b48054f847b61d6fa7bed5cebb663ebc55e018fda12db09dcc664c",
|
||||
"sha256:fdcec0b8399108577ec290f55551d926d9a1fa6cad45882093a7a07ac5ec147b"
|
||||
],
|
||||
"markers": "python_version >= '3' and platform_machine == 'aarch64' or (platform_machine == 'ppc64le' or (platform_machine == 'x86_64' or (platform_machine == 'amd64' or (platform_machine == 'AMD64' or (platform_machine == 'win32' or platform_machine == 'WIN32')))))",
|
||||
"version": "==2.0.0a2"
|
||||
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
|
||||
"version": "==1.1.2"
|
||||
},
|
||||
"gspread": {
|
||||
"hashes": [
|
||||
@@ -393,11 +416,11 @@
|
||||
},
|
||||
"loguru": {
|
||||
"hashes": [
|
||||
"sha256:b28e72ac7a98be3d28ad28570299a393dfcd32e5e3f6a353dec94675767b6319",
|
||||
"sha256:f8087ac396b5ee5f67c963b495d615ebbceac2796379599820e324419d53667c"
|
||||
"sha256:066bd06758d0a513e9836fd9c6b5a75bfb3fd36841f4b996bc60b547a309d41c",
|
||||
"sha256:4e2414d534a2ab57573365b3e6d0234dfb1d84b68b7f3b948e6fb743860a77c3"
|
||||
],
|
||||
"index": "pypi",
|
||||
"version": "==0.5.3"
|
||||
"version": "==0.6.0"
|
||||
},
|
||||
"lxml": {
|
||||
"hashes": [
|
||||
@@ -588,6 +611,68 @@
|
||||
"git": "https://github.com/bellingcat/polyphemus.git",
|
||||
"ref": "00a5123a3768a55ffe29f2c803a4181895f17890"
|
||||
},
|
||||
"psycopg2-binary": {
|
||||
"hashes": [
|
||||
"sha256:01310cf4cf26db9aea5158c217caa92d291f0500051a6469ac52166e1a16f5b7",
|
||||
"sha256:083a55275f09a62b8ca4902dd11f4b33075b743cf0d360419e2051a8a5d5ff76",
|
||||
"sha256:090f3348c0ab2cceb6dfbe6bf721ef61262ddf518cd6cc6ecc7d334996d64efa",
|
||||
"sha256:0a29729145aaaf1ad8bafe663131890e2111f13416b60e460dae0a96af5905c9",
|
||||
"sha256:0c9d5450c566c80c396b7402895c4369a410cab5a82707b11aee1e624da7d004",
|
||||
"sha256:10bb90fb4d523a2aa67773d4ff2b833ec00857f5912bafcfd5f5414e45280fb1",
|
||||
"sha256:12b11322ea00ad8db8c46f18b7dfc47ae215e4df55b46c67a94b4effbaec7094",
|
||||
"sha256:152f09f57417b831418304c7f30d727dc83a12761627bb826951692cc6491e57",
|
||||
"sha256:15803fa813ea05bef089fa78835118b5434204f3a17cb9f1e5dbfd0b9deea5af",
|
||||
"sha256:15c4e4cfa45f5a60599d9cec5f46cd7b1b29d86a6390ec23e8eebaae84e64554",
|
||||
"sha256:183a517a3a63503f70f808b58bfbf962f23d73b6dccddae5aa56152ef2bcb232",
|
||||
"sha256:1f14c8b0942714eb3c74e1e71700cbbcb415acbc311c730370e70c578a44a25c",
|
||||
"sha256:1f6b813106a3abdf7b03640d36e24669234120c72e91d5cbaeb87c5f7c36c65b",
|
||||
"sha256:280b0bb5cbfe8039205c7981cceb006156a675362a00fe29b16fbc264e242834",
|
||||
"sha256:2d872e3c9d5d075a2e104540965a1cf898b52274a5923936e5bfddb58c59c7c2",
|
||||
"sha256:2f9ffd643bc7349eeb664eba8864d9e01f057880f510e4681ba40a6532f93c71",
|
||||
"sha256:3303f8807f342641851578ee7ed1f3efc9802d00a6f83c101d21c608cb864460",
|
||||
"sha256:35168209c9d51b145e459e05c31a9eaeffa9a6b0fd61689b48e07464ffd1a83e",
|
||||
"sha256:3a79d622f5206d695d7824cbf609a4f5b88ea6d6dab5f7c147fc6d333a8787e4",
|
||||
"sha256:404224e5fef3b193f892abdbf8961ce20e0b6642886cfe1fe1923f41aaa75c9d",
|
||||
"sha256:46f0e0a6b5fa5851bbd9ab1bc805eef362d3a230fbdfbc209f4a236d0a7a990d",
|
||||
"sha256:47133f3f872faf28c1e87d4357220e809dfd3fa7c64295a4a148bcd1e6e34ec9",
|
||||
"sha256:526ea0378246d9b080148f2d6681229f4b5964543c170dd10bf4faaab6e0d27f",
|
||||
"sha256:53293533fcbb94c202b7c800a12c873cfe24599656b341f56e71dd2b557be063",
|
||||
"sha256:539b28661b71da7c0e428692438efbcd048ca21ea81af618d845e06ebfd29478",
|
||||
"sha256:57804fc02ca3ce0dbfbef35c4b3a4a774da66d66ea20f4bda601294ad2ea6092",
|
||||
"sha256:63638d875be8c2784cfc952c9ac34e2b50e43f9f0a0660b65e2a87d656b3116c",
|
||||
"sha256:6472a178e291b59e7f16ab49ec8b4f3bdada0a879c68d3817ff0963e722a82ce",
|
||||
"sha256:68641a34023d306be959101b345732360fc2ea4938982309b786f7be1b43a4a1",
|
||||
"sha256:6e82d38390a03da28c7985b394ec3f56873174e2c88130e6966cb1c946508e65",
|
||||
"sha256:761df5313dc15da1502b21453642d7599d26be88bff659382f8f9747c7ebea4e",
|
||||
"sha256:7af0dd86ddb2f8af5da57a976d27cd2cd15510518d582b478fbb2292428710b4",
|
||||
"sha256:7b1e9b80afca7b7a386ef087db614faebbf8839b7f4db5eb107d0f1a53225029",
|
||||
"sha256:874a52ecab70af13e899f7847b3e074eeb16ebac5615665db33bce8a1009cf33",
|
||||
"sha256:887dd9aac71765ac0d0bac1d0d4b4f2c99d5f5c1382d8b770404f0f3d0ce8a39",
|
||||
"sha256:8b344adbb9a862de0c635f4f0425b7958bf5a4b927c8594e6e8d261775796d53",
|
||||
"sha256:8fc53f9af09426a61db9ba357865c77f26076d48669f2e1bb24d85a22fb52307",
|
||||
"sha256:91920527dea30175cc02a1099f331aa8c1ba39bf8b7762b7b56cbf54bc5cce42",
|
||||
"sha256:93cd1967a18aa0edd4b95b1dfd554cf15af657cb606280996d393dadc88c3c35",
|
||||
"sha256:99485cab9ba0fa9b84f1f9e1fef106f44a46ef6afdeec8885e0b88d0772b49e8",
|
||||
"sha256:9d29409b625a143649d03d0fd7b57e4b92e0ecad9726ba682244b73be91d2fdb",
|
||||
"sha256:a29b3ca4ec9defec6d42bf5feb36bb5817ba3c0230dd83b4edf4bf02684cd0ae",
|
||||
"sha256:a9e1f75f96ea388fbcef36c70640c4efbe4650658f3d6a2967b4cc70e907352e",
|
||||
"sha256:accfe7e982411da3178ec690baaceaad3c278652998b2c45828aaac66cd8285f",
|
||||
"sha256:adf20d9a67e0b6393eac162eb81fb10bc9130a80540f4df7e7355c2dd4af9fba",
|
||||
"sha256:af9813db73395fb1fc211bac696faea4ca9ef53f32dc0cfa27e4e7cf766dcf24",
|
||||
"sha256:b1c8068513f5b158cf7e29c43a77eb34b407db29aca749d3eb9293ee0d3103ca",
|
||||
"sha256:bda845b664bb6c91446ca9609fc69f7db6c334ec5e4adc87571c34e4f47b7ddb",
|
||||
"sha256:c381bda330ddf2fccbafab789d83ebc6c53db126e4383e73794c74eedce855ef",
|
||||
"sha256:c3ae8e75eb7160851e59adc77b3a19a976e50622e44fd4fd47b8b18208189d42",
|
||||
"sha256:d1c1b569ecafe3a69380a94e6ae09a4789bbb23666f3d3a08d06bbd2451f5ef1",
|
||||
"sha256:def68d7c21984b0f8218e8a15d514f714d96904265164f75f8d3a70f9c295667",
|
||||
"sha256:dffc08ca91c9ac09008870c9eb77b00a46b3378719584059c034b8945e26b272",
|
||||
"sha256:e3699852e22aa68c10de06524a3721ade969abf382da95884e6a10ff798f9281",
|
||||
"sha256:e847774f8ffd5b398a75bc1c18fbb56564cda3d629fe68fd81971fece2d3c67e",
|
||||
"sha256:ffb7a888a047696e7f8240d649b43fb3644f14f0ee229077e7f6b9f9081635bd"
|
||||
],
|
||||
"index": "pypi",
|
||||
"version": "==2.9.3"
|
||||
},
|
||||
"pyaes": {
|
||||
"hashes": [
|
||||
"sha256:02c1b1405c38d3c370b085fb952dd8bea3fadcee6411ad99f312cc129c536d8f"
|
||||
@@ -711,6 +796,7 @@
|
||||
"sha256:1e760e2fe6a8163bc0b3d9a19c4f84342afa0a2affebfaa84b01b978a02ecaa7",
|
||||
"sha256:e68985985296d9a66a881eb3193b0906246245294a881e7c8afe623866ac6a5c"
|
||||
],
|
||||
"markers": "python_version < '3.9'",
|
||||
"version": "==2022.1"
|
||||
},
|
||||
"pytz-deprecation-shim": {
|
||||
@@ -725,6 +811,7 @@
|
||||
"hashes": [
|
||||
"sha256:af8a9b64b821529aca09ebaf6d8d279100d766f19e90b5059ac6a718ca6dee42"
|
||||
],
|
||||
"index": "pypi",
|
||||
"version": "==2.2.1"
|
||||
},
|
||||
"regex": {
|
||||
@@ -808,6 +895,9 @@
|
||||
"version": "==2022.3.2"
|
||||
},
|
||||
"requests": {
|
||||
"extras": [
|
||||
"socks"
|
||||
],
|
||||
"hashes": [
|
||||
"sha256:68d7c56fd5a8999887728ef304a6d12edc7be74f1cfa47714fc8b414525c9a61",
|
||||
"sha256:f22fa1e554c9ddfd16e6e41ac79759e17be9e492b3587efa038054674760e72d"
|
||||
@@ -828,7 +918,7 @@
|
||||
"sha256:5c6bd9dc7a543b7fe4304a631f8a8a3b674e2bbfc49c2ae96200cdbe55df6b17",
|
||||
"sha256:95c5d300c4e879ee69708c428ba566c59478fd653cc3a22243eeb8ed846950bb"
|
||||
],
|
||||
"markers": "python_version >= '3.6' and python_version < '4'",
|
||||
"markers": "python_version >= '3.6'",
|
||||
"version": "==4.8"
|
||||
},
|
||||
"s3transfer": {
|
||||
@@ -849,7 +939,7 @@
|
||||
},
|
||||
"snscrape": {
|
||||
"git": "https://github.com/bellingcat/snscrape.git",
|
||||
"ref": "fb8d73ac95011b7ad848a6048d3eed1880e80f21"
|
||||
"ref": "d32c9add8a3691c81c9091dc1a7d079e9871379f"
|
||||
},
|
||||
"soupsieve": {
|
||||
"hashes": [
|
||||
@@ -913,7 +1003,7 @@
|
||||
"sha256:4230a49119a416c88cc47d0d2d32d5d90f1a282d5e497d49801950704e49863d",
|
||||
"sha256:6461b009d6792008d0000e1b0c7ca50195ec78c0e808a3a6b668a56a3236c3a5"
|
||||
],
|
||||
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
|
||||
"index": "pypi",
|
||||
"version": "==4.63.1"
|
||||
},
|
||||
"tzdata": {
|
||||
@@ -1073,11 +1163,11 @@
|
||||
},
|
||||
"click": {
|
||||
"hashes": [
|
||||
"sha256:19a4baa64da924c5e0cd889aba8e947f280309f1a2ce0947a3e3a7bcb7cc72d6",
|
||||
"sha256:977c213473c7665d3aa092b41ff12063227751c41d7b17165013e10069cc5cd2"
|
||||
"sha256:5e0d195c2067da3136efb897449ec1e9e6c98282fbf30d7f9e164af9be901a6b",
|
||||
"sha256:7ab900e38149c9872376e8f9b5986ddcaf68c0f413cf73678a0bca5547e6f976"
|
||||
],
|
||||
"markers": "python_version >= '3.7'",
|
||||
"version": "==8.1.0"
|
||||
"version": "==8.1.1"
|
||||
},
|
||||
"coverage": {
|
||||
"extras": [
|
||||
@@ -1321,9 +1411,13 @@
|
||||
"sha256:1e760e2fe6a8163bc0b3d9a19c4f84342afa0a2affebfaa84b01b978a02ecaa7",
|
||||
"sha256:e68985985296d9a66a881eb3193b0906246245294a881e7c8afe623866ac6a5c"
|
||||
],
|
||||
"markers": "python_version < '3.9'",
|
||||
"version": "==2022.1"
|
||||
},
|
||||
"requests": {
|
||||
"extras": [
|
||||
"socks"
|
||||
],
|
||||
"hashes": [
|
||||
"sha256:68d7c56fd5a8999887728ef304a6d12edc7be74f1cfa47714fc8b414525c9a61",
|
||||
"sha256:f22fa1e554c9ddfd16e6e41ac79759e17be9e492b3587efa038054674760e72d"
|
||||
|
||||
135
app.py
Normal file
135
app.py
Normal file
@@ -0,0 +1,135 @@
|
||||
import argparse
|
||||
from loguru import logger
|
||||
import gspread
|
||||
from sqlalchemy import create_engine, func
|
||||
from sqlalchemy.orm import sessionmaker
|
||||
import os
|
||||
import time
|
||||
|
||||
from cisticola.base import Channel, RawChannelInfo, mapper_registry
|
||||
from cisticola.scraper import (
|
||||
ScraperController,
|
||||
BitchuteScraper,
|
||||
GabScraper,
|
||||
GettrScraper,
|
||||
OdyseeScraper,
|
||||
RumbleScraper,
|
||||
TelegramSnscrapeScraper,
|
||||
TelegramTelethonScraper,
|
||||
TwitterScraper)
|
||||
|
||||
def sync_channels(args):
    """Add channels listed in a Google Sheet to the database.

    Reads every record of the ``channels`` worksheet of the spreadsheet at
    ``args.gsheet``. Records whose ``id`` cell is empty are looked up in the
    database by ``(platform_id, platform, url)``; when no matching
    ``Channel`` exists a new one is created with ``source='researcher'``.
    The channel's database ID is then written back to column 1 of the
    record's row. Records that already carry an ID are skipped.

    Parameters
    ----------
    args: argparse.Namespace
        Parsed command line arguments; only ``args.gsheet`` is read.
    """
    logger.info("Synchronizing channels")

    session = get_db_session()

    try:
        gc = gspread.service_account(filename='service_account.json')

        # Open a sheet from a spreadsheet in one go
        wks = gc.open_by_url(args.gsheet).worksheet("channels")
        records = wks.get_all_records()

        # Records start at worksheet row 2 (row 1 is presumably the header
        # row that get_all_records() uses for the dict keys).
        for row, record in enumerate(records, start=2):
            # only adding channels, so skip everything with an ID
            if record['id'] != '':
                continue

            fields = _normalize_channel_row(record)

            # check to see if this already exists
            channel = session.query(Channel).filter_by(
                platform_id=fields['platform_id'],
                platform=fields['platform'],
                url=fields['url']).first()

            if not channel:
                channel = Channel(**fields, source='researcher')
                logger.debug(f"{channel} does not exist, adding")
                session.add(channel)
                # Commit immediately so ``channel.id`` is assigned before it
                # is written back to the sheet (commit also flushes).
                session.commit()

            wks.update_cell(row, 1, channel.id)
            # NOTE(review): presumably throttles writes to stay under the
            # Google Sheets API quota -- confirm.
            time.sleep(1)

        session.commit()
    finally:
        # Release the connection even when the sheet or the query fails.
        session.close()


def _normalize_channel_row(record):
    """Turn one worksheet record into keyword arguments for ``Channel``.

    The ``id`` and ``followers`` columns are dropped, empty ``public`` and
    ``chat`` cells become ``False``, ``'TRUE'``/``'yes'`` become ``True``,
    ``'FALSE'``/``'no'`` become ``False`` and any remaining empty cell
    becomes ``None``. The input record is left unmodified.
    """
    fields = dict(record)
    del fields['id']
    del fields['followers']

    # An empty flag cell means "no", not "unknown".
    for flag in ('public', 'chat'):
        if fields[flag] == '':
            fields[flag] = False

    normalized = {}
    for key, value in fields.items():
        if value == 'TRUE' or value == 'yes':
            value = True
        elif value == 'FALSE' or value == 'no':
            value = False
        elif value == '':
            value = None
        normalized[key] = value

    return normalized
|
||||
|
||||
def get_db_session():
    """Return a new SQLAlchemy session for the configured database.

    The connection URL is read from the ``DB`` environment variable; a
    ``KeyError`` is raised when that variable is not set.
    """
    db_engine = create_engine(os.environ['DB'])

    # Build a session factory bound to the engine and hand out one session.
    make_session = sessionmaker(bind=db_engine)
    return make_session()
|
||||
|
||||
def get_scraper_controller():
    """Build a ``ScraperController`` connected to the configured database.

    The database URL is read from the ``DB`` environment variable. The
    returned controller has a ``TelegramTelethonScraper`` and a
    ``TwitterScraper`` registered, in that order.
    """
    db_engine = create_engine(os.environ['DB'])

    scraper_controller = ScraperController()
    scraper_controller.connect_to_db(db_engine)
    scraper_controller.register_scrapers([
        TelegramTelethonScraper(),
        TwitterScraper(),
    ])

    return scraper_controller
|
||||
|
||||
def scrape_channels(args):
    """Scrape all channels, archiving media as well when ``args.media`` is set.

    Parameters
    ----------
    args: argparse.Namespace
        Parsed command line arguments; only ``args.media`` is read.
    """
    with_media = args.media
    logger.info(f"Scraping channels, media: {with_media}")

    get_scraper_controller().scrape_all_channels(archive_media=with_media)
|
||||
|
||||
def scrape_channel_info(args):
    """Scrape channel info for all channels via the scraper controller.

    Parameters
    ----------
    args: argparse.Namespace
        Parsed command line arguments; not used by this command.
    """
    logger.info("Scraping channel info")

    get_scraper_controller().scrape_all_channel_info()
|
||||
|
||||
def archive_media(args):
    """Archive media that has not been archived yet via the scraper controller.

    Parameters
    ----------
    args: argparse.Namespace
        Parsed command line arguments; not used by this command.
    """
    logger.info("Archiving unarchived media")

    get_scraper_controller().archive_unarchived_media()
|
||||
|
||||
def init_db():
    """Create every table registered on ``mapper_registry`` in the database.

    The database URL is read from the ``DB`` environment variable.
    """
    db_engine = create_engine(os.environ['DB'])
    metadata = mapper_registry.metadata
    metadata.create_all(bind=db_engine)
|
||||
|
||||
if __name__ == '__main__':
    # Command line entry point: parse the arguments and dispatch to the
    # function implementing the requested command.
    logger.add("./test.log", level="TRACE")

    # Maps each command name to its handler. ``init_db`` takes no arguments,
    # so it is wrapped to share the handlers' one-argument call shape.
    commands = {
        'init-db': lambda args: init_db(),
        'sync-channels': sync_channels,
        'scrape-channels': scrape_channels,
        'archive-media': archive_media,
        'channel-info': scrape_channel_info,
    }
    command_names = ', '.join('"' + name + '"' for name in commands)

    parser = argparse.ArgumentParser(description = 'Cisticola command line tools')
    parser.add_argument('command', type=str, help='Command to run, one of: ' + command_names)
    parser.add_argument('--gsheet', type=str, help='[sync-channels] URL of Google Sheet to synchronize')
    parser.add_argument('--media', action='store_true', help='[scrape-channels] Add this flag to also archive media while scraping')

    args = parser.parse_args()

    handler = commands.get(args.command)

    if handler is None:
        logger.error(f"Unrecognized command {args.command}")
        # Exit non-zero so shell scripts and schedulers can detect the failure.
        raise SystemExit(1)

    if args.command == 'sync-channels' and not args.gsheet:
        # Fail early with a usage message instead of an error inside gspread.
        parser.error('--gsheet is required for the "sync-channels" command')

    handler(args)
|
||||
@@ -34,7 +34,7 @@ class ScraperResult:
|
||||
date: datetime
|
||||
|
||||
#: JSON dump of dict that contains all data scraped for the post.
|
||||
raw_data: str
|
||||
raw_posts: str
|
||||
|
||||
#: Datetime (relative to UTC) that the scraped post was archived at.
|
||||
date_archived: datetime
|
||||
@@ -44,7 +44,7 @@ class ScraperResult:
|
||||
|
||||
#: Has the media in this post been archived?
|
||||
media_archived: bool
|
||||
|
||||
|
||||
@dataclass
|
||||
class Channel:
|
||||
"""Information about a specific channel to be scraped.
|
||||
@@ -89,11 +89,31 @@ class Channel:
|
||||
def hydrate(self):
|
||||
pass
|
||||
|
||||
@dataclass
class RawChannelInfo:
    """A minimally processed channel-info result from a scraper.

    Instances are mapped to the ``raw_channel_info`` table.
    """

    #: String specifying name and version of scraper used to generate result, e.g. ``"TwitterScraper 0.0.1"``.
    scraper: str

    #: Name of platform from which result was scraped, e.g. ``"Twitter"``.
    platform: str

    #: Foreign key of channel ID (``channels.id``) that this was scraped from
    channel: int

    #: JSON dump of dict that contains all data scraped for the channel.
    #: NOTE(review): the original comment said "for the post"; presumably a
    #: copy-paste from the post record -- confirm.
    raw_data: str

    #: Datetime (relative to UTC) that the scraped channel info was archived at.
    date_archived: datetime
|
||||
|
||||
@dataclass
|
||||
class Post:
|
||||
"""An object with fields for columns in the analysis table"""
|
||||
|
||||
#: ID number of the scraped post in the ``raw_data`` table
|
||||
#: ID number of the scraped post in the ``raw_posts`` table
|
||||
raw_id: int
|
||||
|
||||
#: Platform specific post ID
|
||||
@@ -144,7 +164,7 @@ class Media:
|
||||
"""Base class for organizing information about a media file.
|
||||
"""
|
||||
|
||||
#: ID number of the media's corresponding scraped post in the ``raw_data`` table.
|
||||
#: ID number of the media's corresponding scraped post in the ``raw_posts`` table.
|
||||
raw_id: int
|
||||
|
||||
#: ID number of the media's corresponding scraped post in the ``analysis`` table.
|
||||
@@ -221,7 +241,7 @@ class Video(Media):
|
||||
|
||||
mapper_registry = registry()
|
||||
|
||||
raw_data_table = Table('raw_data', mapper_registry.metadata,
|
||||
raw_posts_table = Table('raw_posts', mapper_registry.metadata,
|
||||
Column('id', Integer, primary_key=True,
|
||||
autoincrement=True),
|
||||
Column('scraper', String),
|
||||
@@ -229,15 +249,23 @@ raw_data_table = Table('raw_data', mapper_registry.metadata,
|
||||
Column('channel', Integer, ForeignKey('channels.id')),
|
||||
Column('platform_id', String),
|
||||
Column('date', DateTime),
|
||||
Column('raw_data', String),
|
||||
Column('raw_posts', String),
|
||||
Column('date_archived', DateTime),
|
||||
Column('archived_urls', JSON),
|
||||
Column('media_archived', Boolean))
|
||||
|
||||
raw_channel_info_table = Table('raw_channel_info', mapper_registry.metadata,
|
||||
Column('id', Integer, primary_key=True),
|
||||
Column('scraper', String),
|
||||
Column('platform', String),
|
||||
Column('channel', Integer, ForeignKey('channels.id')),
|
||||
Column('raw_data', String),
|
||||
Column('date_archived', DateTime))
|
||||
|
||||
channel_table = Table('channels', mapper_registry.metadata,
|
||||
Column('id', Integer, primary_key=True, autoincrement=True),
|
||||
Column('name', String),
|
||||
Column('platform_id', Integer),
|
||||
Column('platform_id', String),
|
||||
Column('category', String),
|
||||
Column('platform', String),
|
||||
Column('url', String),
|
||||
@@ -253,7 +281,7 @@ channel_table = Table('channels', mapper_registry.metadata,
|
||||
post_table = Table('posts', mapper_registry.metadata,
|
||||
Column('id', Integer, primary_key=True,
|
||||
autoincrement=True),
|
||||
Column('raw_id', Integer, ForeignKey('raw_data.id')),
|
||||
Column('raw_id', Integer, ForeignKey('raw_posts.id')),
|
||||
Column('platform_id', Integer),
|
||||
Column('scraper', String),
|
||||
Column('transformer', String),
|
||||
@@ -273,7 +301,7 @@ media_table = Table('media', mapper_registry.metadata,
|
||||
Column('id', Integer, primary_key=True,
|
||||
autoincrement=True),
|
||||
Column('type', String),
|
||||
Column('raw_id', Integer, ForeignKey('raw_data.id')),
|
||||
Column('raw_id', Integer, ForeignKey('raw_posts.id')),
|
||||
Column('post', Integer, ForeignKey('posts.id')),
|
||||
Column('url', String),
|
||||
Column('original_url', String),
|
||||
@@ -282,7 +310,8 @@ media_table = Table('media', mapper_registry.metadata,
|
||||
|
||||
mapper_registry.map_imperatively(Post, post_table)
|
||||
mapper_registry.map_imperatively(Channel, channel_table)
|
||||
mapper_registry.map_imperatively(ScraperResult, raw_data_table)
|
||||
mapper_registry.map_imperatively(ScraperResult, raw_posts_table)
|
||||
mapper_registry.map_imperatively(RawChannelInfo, raw_channel_info_table)
|
||||
mapper_registry.map_imperatively(Media, media_table, polymorphic_on='type', polymorphic_identity='media')
|
||||
mapper_registry.map_imperatively(Image, media_table, inherits=Media, polymorphic_on='type', polymorphic_identity='image')
|
||||
mapper_registry.map_imperatively(Video, media_table, inherits=Media, polymorphic_on='type', polymorphic_identity='video')
|
||||
@@ -303,6 +303,9 @@ class ScraperController:
|
||||
"""
|
||||
self.scrapers.extend(scraper)
|
||||
|
||||
def remove_all_scrapers(self):
    """Unregister every scraper by rebinding ``self.scrapers`` to a new empty list."""
    self.scrapers = list()
|
||||
|
||||
def scrape_all_channels(self, archive_media: bool = True):
|
||||
if self.session is None:
|
||||
logger.error("No DB session")
|
||||
@@ -313,6 +316,17 @@ class ScraperController:
|
||||
channels = session.query(Channel).where(Channel.source=='researcher').all()
|
||||
|
||||
return self.scrape_channels(channels, archive_media=archive_media)
|
||||
|
||||
def scrape_all_channel_info(self):
    """Scrape channel info for every channel whose ``source`` is ``'researcher'``.

    Logs an error and returns ``None`` when ``self.session`` is ``None``.
    Otherwise opens a session, queries the matching ``Channel`` rows and
    returns the result of ``scrape_channel_info`` for them.
    """
    session_factory = self.session
    if session_factory is None:
        logger.error("No DB session")
        return

    researcher_channels = (
        session_factory()
        .query(Channel)
        .where(Channel.source == 'researcher')
        .all()
    )

    return self.scrape_channel_info(researcher_channels)
|
||||
|
||||
@logger.catch(reraise = True)
|
||||
def scrape_channels(self, channels: List[Channel], archive_media: bool = True):
|
||||
@@ -336,6 +350,7 @@ class ScraperController:
|
||||
|
||||
for scraper in self.scrapers:
|
||||
if scraper.can_handle(channel):
|
||||
logger.debug(f"{scraper} is handling {channel}")
|
||||
handled = True
|
||||
added = 0
|
||||
|
||||
@@ -382,7 +397,7 @@ class ScraperController:
|
||||
for scraper in self.scrapers:
|
||||
if scraper.__version__ == post.scraper:
|
||||
handled = True
|
||||
logger.info(f"{scraper} is archiving media for {post}")
|
||||
logger.debug(f"{scraper} is archiving media for ID {post.id}")
|
||||
post = scraper.archive_files(post)
|
||||
|
||||
if post:
|
||||
@@ -396,6 +411,48 @@ class ScraperController:
|
||||
|
||||
session.commit()
|
||||
|
||||
@logger.catch(reraise = True)
def scrape_channel_info(self, channels: List[Channel]):
    """Scrape and store channel info for the specified channels.

    For each channel the registered scrapers are tried in order. The first
    scraper whose ``can_handle`` accepts the channel and whose
    ``get_profile`` succeeds has its result added to the database and
    committed; the remaining scrapers are then skipped for that channel.

    Parameters
    ----------
    channels: list<Channel>
        List of Channel instances whose info is to be scraped
    """

    if self.session is None:
        logger.error("No DB session")
        return

    for channel in channels:
        handled = False

        for scraper in self.scrapers:
            if not scraper.can_handle(channel):
                continue

            logger.debug(f"{scraper} is getting channel info for {channel}")
            handled = True

            try:
                info = scraper.get_profile(channel)
            except ChannelDoesNotExistError:
                # Not fatal: let any remaining scraper try this channel.
                logger.warning(f"ChannelDoesNotExist {channel}")
                continue

            # Open a session only once there is something to store, and
            # always release it, including when add/commit raises.
            session = self.session()
            try:
                session.add(info)
                session.commit()
                # Logged while the session is still open, as before.
                logger.info(
                    f"{scraper} found {info}")
            finally:
                session.close()
            break

        if not handled:
            logger.warning(f"No handler found for Channel {channel}")
|
||||
|
||||
def connect_to_db(self, engine):
|
||||
"""Connect the specified SQLAlchemy engine to the controller.
|
||||
"""
|
||||
|
||||
@@ -9,7 +9,7 @@ from typing import Generator
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
from cisticola.base import Channel, ScraperResult
|
||||
from cisticola.base import Channel, ScraperResult, RawChannelInfo
|
||||
from cisticola.scraper.base import Scraper
|
||||
|
||||
class BitchuteScraper(Scraper):
|
||||
@@ -57,7 +57,7 @@ class BitchuteScraper(Scraper):
|
||||
platform_id=post['id'],
|
||||
date=datetime.fromtimestamp(post['timestamp']),
|
||||
date_archived=datetime.now(timezone.utc),
|
||||
raw_data=json.dumps(post),
|
||||
raw_posts=json.dumps(post),
|
||||
archived_urls=archived_urls,
|
||||
media_archived=archive_media)
|
||||
|
||||
@@ -65,7 +65,7 @@ class BitchuteScraper(Scraper):
|
||||
if channel.platform == "Bitchute" and self.get_username_from_url(channel.url) is not None:
|
||||
return True
|
||||
|
||||
def get_profile(self, channel: Channel) -> dict:
|
||||
def get_profile(self, channel: Channel) -> RawChannelInfo:
|
||||
|
||||
base_url = channel.url
|
||||
|
||||
@@ -106,8 +106,12 @@ class BitchuteScraper(Scraper):
|
||||
'subscribers': counts['subscriber_count'],
|
||||
'views': int(counts['about_view_count'].split(' ')[0])}
|
||||
|
||||
return profile
|
||||
|
||||
|
||||
return RawChannelInfo(scraper=self.__version__,
|
||||
platform=channel.platform,
|
||||
channel=channel.id,
|
||||
raw_data=json.dumps(profile),
|
||||
date_archived=datetime.now(timezone.utc))
|
||||
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
|
||||
|
||||
def strip_tags(html, convert_newlines=True):
|
||||
|
||||
@@ -5,7 +5,7 @@ import os
|
||||
|
||||
from gabber.client import Client, GAB_API_BASE_URL
|
||||
|
||||
from cisticola.base import Channel, ScraperResult
|
||||
from cisticola.base import Channel, ScraperResult, RawChannelInfo
|
||||
from cisticola.scraper.base import Scraper
|
||||
|
||||
class GabScraper(Scraper):
|
||||
@@ -80,7 +80,7 @@ class GabScraper(Scraper):
|
||||
platform_id=post['id'],
|
||||
date=datetime.fromisoformat(post['created_at'].replace("Z", "+00:00")).replace(tzinfo=timezone.utc),
|
||||
date_archived=datetime.now(timezone.utc),
|
||||
raw_data=json.dumps(post),
|
||||
raw_posts=json.dumps(post),
|
||||
archived_urls=archived_urls,
|
||||
media_archived=archive_media)
|
||||
|
||||
@@ -88,7 +88,7 @@ class GabScraper(Scraper):
|
||||
if channel.platform == "Gab" and self.get_username_from_url(channel.url) is not None:
|
||||
return True
|
||||
|
||||
def get_profile(self, channel: Channel) -> dict:
|
||||
def get_profile(self, channel: Channel) -> RawChannelInfo:
|
||||
|
||||
client = Client(
|
||||
username = os.environ['GAB_USER'],
|
||||
@@ -106,4 +106,8 @@ class GabScraper(Scraper):
|
||||
|
||||
profile = client._get(GAB_API_BASE_URL + f"/account_by_username/{username}").json()
|
||||
|
||||
return profile
|
||||
return RawChannelInfo(scraper=self.__version__,
|
||||
platform=channel.platform,
|
||||
channel=channel.id,
|
||||
raw_data=json.dumps(profile),
|
||||
date_archived=datetime.now(timezone.utc))
|
||||
|
||||
@@ -5,7 +5,7 @@ from urllib.parse import urlparse
|
||||
|
||||
from gogettr import PublicClient
|
||||
|
||||
from cisticola.base import Channel, ScraperResult
|
||||
from cisticola.base import Channel, ScraperResult, RawChannelInfo
|
||||
from cisticola.scraper.base import Scraper
|
||||
|
||||
class GettrScraper(Scraper):
|
||||
@@ -58,7 +58,7 @@ class GettrScraper(Scraper):
|
||||
platform_id=post['_id'],
|
||||
date=datetime.fromtimestamp(post['cdate']/1000.),
|
||||
date_archived=datetime.now(timezone.utc),
|
||||
raw_data=json.dumps(post),
|
||||
raw_posts=json.dumps(post),
|
||||
archived_urls=archived_urls,
|
||||
media_archived=archive_media)
|
||||
|
||||
@@ -71,9 +71,13 @@ class GettrScraper(Scraper):
|
||||
key = urlparse(url).path.split('/')[-2] + ext
|
||||
return key
|
||||
|
||||
def get_profile(self, channel: Channel) -> RawChannelInfo:
    """Fetch the Gettr user info for ``channel`` and wrap it as ``RawChannelInfo``.

    The username is derived from ``channel.url``. The value returned by
    ``PublicClient.user_info`` is stored JSON-encoded in ``raw_data``,
    together with this scraper's version, the channel's platform and id,
    and the current UTC time as ``date_archived``.
    """
    client = PublicClient()
    username = self.get_username_from_url(channel.url)
    profile = client.user_info(username)

    return RawChannelInfo(scraper=self.__version__,
        platform=channel.platform,
        channel=channel.id,
        raw_data=json.dumps(profile),
        date_archived=datetime.now(timezone.utc))
|
||||
|
||||
@@ -8,7 +8,7 @@ from pathlib import Path
|
||||
from loguru import logger
|
||||
import instaloader
|
||||
|
||||
from cisticola.base import Channel, ScraperResult
|
||||
from cisticola.base import Channel, ScraperResult, RawChannelInfo
|
||||
from cisticola.scraper.base import Scraper
|
||||
|
||||
BASE_URL = 'https://www.instagram.com/'
|
||||
@@ -79,7 +79,7 @@ class InstagramScraper(Scraper):
|
||||
platform_id=post.mediaid,
|
||||
date=post.date_utc,
|
||||
date_archived=datetime.now(timezone.utc),
|
||||
raw_data=json.dumps(post._asdict(), default=str),
|
||||
raw_posts=json.dumps(post._asdict(), default=str),
|
||||
archived_urls=archived_urls,
|
||||
media_archived=archive_media)
|
||||
|
||||
@@ -96,7 +96,7 @@ class InstagramScraper(Scraper):
|
||||
platform_id=post.mediaid,
|
||||
date=comment.created_at_utc,
|
||||
date_archived=datetime.now(timezone.utc),
|
||||
raw_data=json.dumps(comment_dict, default=str),
|
||||
raw_posts=json.dumps(comment_dict, default=str),
|
||||
archived_urls={},
|
||||
media_archived=archive_media)
|
||||
|
||||
@@ -104,7 +104,7 @@ class InstagramScraper(Scraper):
|
||||
if channel.platform == "Instagram" and self.get_username_from_url(channel.url) is not None:
|
||||
return True
|
||||
|
||||
def get_profile(self, channel: Channel) -> dict:
|
||||
def get_profile(self, channel: Channel) -> RawChannelInfo:
|
||||
|
||||
username = self.get_username_from_url(channel.url)
|
||||
|
||||
@@ -125,4 +125,8 @@ class InstagramScraper(Scraper):
|
||||
profile['followers'] = user_profile.followers
|
||||
profile['followees'] = user_profile.followees
|
||||
|
||||
return profile
|
||||
return RawChannelInfo(scraper=self.__version__,
|
||||
platform=channel.platform,
|
||||
channel=channel.id,
|
||||
raw_data=json.dumps(profile),
|
||||
date_archived=datetime.now(timezone.utc))
|
||||
|
||||
@@ -8,7 +8,7 @@ from loguru import logger
|
||||
|
||||
from polyphemus.base import OdyseeChannel
|
||||
from polyphemus.api import get_auth_token
|
||||
from cisticola.base import Channel, ScraperResult
|
||||
from cisticola.base import Channel, ScraperResult, RawChannelInfo
|
||||
from cisticola.scraper.base import Scraper
|
||||
|
||||
class OdyseeScraper(Scraper):
|
||||
@@ -60,7 +60,7 @@ class OdyseeScraper(Scraper):
|
||||
platform_id=video.info['claim_id'],
|
||||
date=datetime.fromtimestamp(video.info['created']),
|
||||
date_archived=datetime.now(timezone.utc),
|
||||
raw_data=json.dumps(video.info),
|
||||
raw_posts=json.dumps(video.info),
|
||||
archived_urls=archived_urls,
|
||||
media_archived=archive_media)
|
||||
|
||||
@@ -73,7 +73,7 @@ class OdyseeScraper(Scraper):
|
||||
platform_id=comment.info['claim_id'],
|
||||
date=datetime.fromtimestamp(comment.info['created']),
|
||||
date_archived=datetime.now(),
|
||||
raw_data=json.dumps(comment.info),
|
||||
raw_posts=json.dumps(comment.info),
|
||||
archived_urls={},
|
||||
media_archived=True)
|
||||
|
||||
@@ -87,10 +87,14 @@ class OdyseeScraper(Scraper):
|
||||
|
||||
return f'{key}.{ext}'
|
||||
|
||||
def get_profile(self, channel: Channel) -> RawChannelInfo:
    """Fetch the Odysee channel info for ``channel`` and wrap it as ``RawChannelInfo``.

    The channel name is derived from ``channel.url`` and looked up through
    ``OdyseeChannel`` using ``self.auth_token``. Its ``info`` attribute is
    stored JSON-encoded in ``raw_data``, together with this scraper's
    version, the channel's platform and id, and the current UTC time.
    """

    username = self.get_username_from_url(channel.url)
    odysee_channel = OdyseeChannel(channel_name = username, auth_token = self.auth_token)
    profile = odysee_channel.info

    return RawChannelInfo(scraper=self.__version__,
        platform=channel.platform,
        channel=channel.id,
        raw_data=json.dumps(profile),
        date_archived=datetime.now(timezone.utc))
|
||||
@@ -5,7 +5,7 @@ from urllib.parse import urlparse
|
||||
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
from cisticola.base import Channel, ScraperResult
|
||||
from cisticola.base import Channel, ScraperResult, RawChannelInfo
|
||||
from cisticola.scraper import Scraper, make_request
|
||||
|
||||
BASE_URL = 'https://rumble.com'
|
||||
@@ -39,7 +39,7 @@ class RumbleScraper(Scraper):
|
||||
platform_id=post['media_url'].split('/')[-2],
|
||||
date=post['datetime'].replace(tzinfo=timezone.utc),
|
||||
date_archived=datetime.now(timezone.utc),
|
||||
raw_data=json.dumps(post, default = str),
|
||||
raw_posts=json.dumps(post, default = str),
|
||||
archived_urls=archived_urls,
|
||||
media_archived=archive_media)
|
||||
|
||||
@@ -52,11 +52,15 @@ class RumbleScraper(Scraper):
|
||||
if channel.platform == "Rumble" and channel.url is not None:
|
||||
return True
|
||||
|
||||
def get_profile(self, channel: Channel) -> RawChannelInfo:
    """Scrape the Rumble profile at ``channel.url`` and wrap it as ``RawChannelInfo``.

    The value returned by ``get_channel_profile`` is stored JSON-encoded in
    ``raw_data``, together with this scraper's version, the channel's
    platform and id, and the current UTC time as ``date_archived``.
    """

    profile = get_channel_profile(url = channel.url)

    return RawChannelInfo(scraper=self.__version__,
        platform=channel.platform,
        channel=channel.id,
        raw_data=json.dumps(profile),
        date_archived=datetime.now(timezone.utc))
|
||||
|
||||
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
|
||||
|
||||
@@ -128,6 +132,7 @@ def get_channel_profile(url):
|
||||
'thumbnail': thumbnail_soup.get('src') if thumbnail_soup else None,
|
||||
'cover': cover_soup.get('src') if cover_soup else None,
|
||||
'subscribers': soup.find('span', {'class' : 'subscribe-button-count'}).text}
|
||||
|
||||
return profile
|
||||
|
||||
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
|
||||
@@ -1,10 +1,10 @@
|
||||
from typing import Generator
|
||||
from datetime import datetime, timezone
|
||||
|
||||
import json
|
||||
import snscrape.modules
|
||||
from loguru import logger
|
||||
|
||||
from cisticola.base import Channel, ScraperResult
|
||||
from cisticola.base import Channel, ScraperResult, RawChannelInfo
|
||||
from cisticola.scraper.base import Scraper
|
||||
|
||||
class TelegramSnscrapeScraper(Scraper):
|
||||
@@ -49,15 +49,20 @@ class TelegramSnscrapeScraper(Scraper):
|
||||
platform_id=post.url,
|
||||
date=post.date,
|
||||
date_archived=datetime.now(timezone.utc),
|
||||
raw_data=post.json(),
|
||||
raw_posts=post.json(),
|
||||
archived_urls=archived_urls,
|
||||
media_archived=archive_media
|
||||
)
|
||||
|
||||
def get_profile(self, channel: Channel) -> RawChannelInfo:
    """Fetch the Telegram channel entity via snscrape and wrap it as ``RawChannelInfo``.

    The scraper is built from ``channel.screenname``. The entity's
    ``__dict__`` is stored JSON-encoded in ``raw_data``, together with this
    scraper's version, the channel's platform and id, and the current UTC
    time as ``date_archived``.
    """

    scr = snscrape.modules.telegram.TelegramChannelScraper(
        channel.screenname)

    profile = scr._get_entity().__dict__

    return RawChannelInfo(scraper=self.__version__,
        platform=channel.platform,
        channel=channel.id,
        # default=str matches how the Twitter scraper serialises an entity
        # __dict__; it only affects values json cannot encode natively.
        raw_data=json.dumps(profile, default=str),
        date_archived=datetime.now(timezone.utc))
|
||||
|
||||
@@ -11,7 +11,7 @@ from telethon.sync import TelegramClient
|
||||
from telethon.tl.functions.channels import GetFullChannelRequest
|
||||
from telethon.tl import types
|
||||
|
||||
from cisticola.base import Channel, ScraperResult
|
||||
from cisticola.base import Channel, ScraperResult, RawChannelInfo
|
||||
from cisticola.scraper.base import Scraper
|
||||
|
||||
MEDIA_TYPES = ['photo', 'video', 'document', 'webpage']
|
||||
@@ -44,7 +44,7 @@ class TelegramTelethonScraper(Scraper):
|
||||
key = list(result.archived_urls.keys())[0]
|
||||
|
||||
if result.archived_urls[key] is None:
|
||||
raw = json.loads(result.raw_data)
|
||||
raw = json.loads(result.raw_posts)
|
||||
|
||||
message = client.get_messages(raw['peer_id']['channel_id'], ids=[raw['id']])
|
||||
|
||||
@@ -66,13 +66,10 @@ class TelegramTelethonScraper(Scraper):
|
||||
return result
|
||||
|
||||
def archive_post_media(self, post : types.Message, client : TelegramClient = None):
|
||||
logger.debug(f"Archiving post {post}")
|
||||
|
||||
if post.media is None:
|
||||
logger.debug("No media for post")
|
||||
return None, None
|
||||
|
||||
logger.debug(f"Archiving media {post.media}")
|
||||
|
||||
if client is None:
|
||||
api_id = os.environ['TELEGRAM_API_ID']
|
||||
api_hash = os.environ['TELEGRAM_API_HASH']
|
||||
@@ -81,6 +78,11 @@ class TelegramTelethonScraper(Scraper):
|
||||
with TelegramClient(phone, api_id, api_hash) as client:
|
||||
return self.archive_post_media(post, client=client)
|
||||
|
||||
if type(post.media) == types.MessageMediaDocument:
|
||||
logger.debug(f"Archiving {type(post.media)} with size {post.media.document.size/(1024*1024)} MB")
|
||||
else:
|
||||
logger.debug(f"Archiving {type(post.media)}")
|
||||
|
||||
key = f'{post.peer_id.channel_id}_{post.id}'
|
||||
|
||||
with tempfile.TemporaryDirectory() as temp_dir:
|
||||
@@ -88,6 +90,10 @@ class TelegramTelethonScraper(Scraper):
|
||||
|
||||
client.download_media(post.media, output_file)
|
||||
|
||||
if len(os.listdir(temp_dir)) == 0:
|
||||
logger.warning(f"No file present. Could not archive {post.media}")
|
||||
return None, None
|
||||
|
||||
output_file_with_ext = os.listdir(temp_dir)[0]
|
||||
filename = Path(temp_dir, output_file_with_ext)
|
||||
|
||||
@@ -96,11 +102,13 @@ class TelegramTelethonScraper(Scraper):
|
||||
return (blob, output_file_with_ext)
|
||||
|
||||
def can_handle(self, channel):
    """Return ``True`` when ``channel`` is a public Telegram channel, else ``False``.

    ``channel.public`` is only read when ``channel.platform`` is
    ``"Telegram"``. The result is always a bool rather than ``True`` or an
    implicit ``None``.
    """
    return bool(channel.platform == "Telegram" and channel.public)
|
||||
|
||||
def get_posts(self, channel: Channel, since: ScraperResult = None, archive_media: bool = True) -> Generator[ScraperResult, None, None]:
|
||||
username = self.get_username_from_url(channel.url)
|
||||
username = channel.screenname
|
||||
if username is None:
|
||||
username = self.get_username_from_url(channel.url)
|
||||
|
||||
api_id = os.environ['TELEGRAM_API_ID']
|
||||
api_hash = os.environ['TELEGRAM_API_HASH']
|
||||
@@ -110,14 +118,13 @@ class TelegramTelethonScraper(Scraper):
|
||||
for post in client.iter_messages(username):
|
||||
post_url = f'{channel.url}/{post.id}'
|
||||
|
||||
logger.info(f"Archiving post {post_url} from {post.date}")
|
||||
logger.trace(f"Archiving post {post_url} from {post.date}")
|
||||
|
||||
if since is not None and post.date.replace(tzinfo=timezone.utc) <= since.date.replace(tzinfo=timezone.utc):
|
||||
logger.info(f'Timestamp of post {post} is earlier than the previous archived timestamp {post.date.replace(tzinfo=timezone.utc)}')
|
||||
break
|
||||
|
||||
archived_urls = {}
|
||||
logger.info(f"Archiving post {post_url}")
|
||||
|
||||
if post.media is not None:
|
||||
archived_urls[post_url] = None
|
||||
@@ -136,13 +143,14 @@ class TelegramTelethonScraper(Scraper):
|
||||
platform_id=post_url,
|
||||
date=post.date.replace(tzinfo=timezone.utc),
|
||||
date_archived=datetime.now(timezone.utc),
|
||||
raw_data=json.dumps(post.to_dict(), default=str),
|
||||
raw_posts=json.dumps(post.to_dict(), default=str),
|
||||
archived_urls=archived_urls,
|
||||
media_archived=archive_media)
|
||||
|
||||
def get_profile(self, channel: Channel) -> dict:
|
||||
|
||||
username = self.get_username_from_url(channel.url)
|
||||
def get_profile(self, channel: Channel) -> RawChannelInfo:
|
||||
username = channel.screenname
|
||||
if username is None:
|
||||
username = self.get_username_from_url(channel.url)
|
||||
|
||||
api_id = os.environ['TELEGRAM_API_ID']
|
||||
api_hash = os.environ['TELEGRAM_API_HASH']
|
||||
@@ -150,6 +158,10 @@ class TelegramTelethonScraper(Scraper):
|
||||
|
||||
with TelegramClient(phone, api_id, api_hash) as client:
|
||||
full_channel = client(GetFullChannelRequest(channel = username))
|
||||
profile = full_channel.__dict__
|
||||
profile = full_channel.to_dict()
|
||||
|
||||
return profile
|
||||
return RawChannelInfo(scraper=self.__version__,
|
||||
platform=channel.platform,
|
||||
channel=channel.id,
|
||||
raw_data=json.dumps(profile, default=str),
|
||||
date_archived=datetime.now(timezone.utc))
|
||||
|
||||
@@ -1,11 +1,11 @@
|
||||
from datetime import datetime, timezone
|
||||
from typing import Generator
|
||||
from urllib.parse import urlparse, parse_qs
|
||||
|
||||
from snscrape.modules.twitter import TwitterProfileScraper, TwitterUserScraper, Video, Gif, Photo
|
||||
from loguru import logger
|
||||
import json
|
||||
|
||||
from cisticola.base import Channel, ScraperResult
|
||||
from cisticola.base import Channel, ScraperResult, RawChannelInfo
|
||||
from cisticola.scraper.base import Scraper, ChannelDoesNotExistError
|
||||
|
||||
class TwitterScraper(Scraper):
|
||||
@@ -13,7 +13,12 @@ class TwitterScraper(Scraper):
|
||||
__version__ = "TwitterScraper 0.0.1"
|
||||
|
||||
def get_posts(self, channel: Channel, since: ScraperResult = None, archive_media: bool = True) -> Generator[ScraperResult, None, None]:
|
||||
scraper = TwitterProfileScraper(channel.platform_id)
|
||||
if channel.platform_id:
|
||||
identifier = channel.platform_id
|
||||
else:
|
||||
identifier = channel.screenname
|
||||
|
||||
scraper = TwitterProfileScraper(identifier)
|
||||
|
||||
first = True
|
||||
|
||||
@@ -32,10 +37,10 @@ class TwitterScraper(Scraper):
|
||||
if tweet.media:
|
||||
media_list += tweet.media
|
||||
|
||||
if tweet.retweetedTweet and tweet.retweetedTweet.media:
|
||||
if tweet.retweetedTweet and hasattr(tweet.retweetedTweet, 'media') and tweet.retweetedTweet.media:
|
||||
media_list += tweet.retweetedTweet.media
|
||||
|
||||
if tweet.quotedTweet and tweet.quotedTweet.media:
|
||||
if tweet.quotedTweet and hasattr(tweet.quotedTweet, 'media') and tweet.quotedTweet.media:
|
||||
media_list += tweet.quotedTweet.media
|
||||
|
||||
for media in media_list:
|
||||
@@ -66,12 +71,12 @@ class TwitterScraper(Scraper):
|
||||
platform_id=tweet.id,
|
||||
date=tweet.date,
|
||||
date_archived=datetime.now(timezone.utc),
|
||||
raw_data=tweet.json(),
|
||||
raw_posts=tweet.json(),
|
||||
archived_urls=archived_urls,
|
||||
media_archived=archive_media)
|
||||
|
||||
def can_handle(self, channel):
    """Return ``True`` for a Twitter channel that has a ``platform_id`` or a ``screenname``.

    The identifiers are only read when ``channel.platform`` is
    ``"Twitter"``. The result is always a bool rather than ``True`` or an
    implicit ``None``.
    """
    return channel.platform == "Twitter" and bool(channel.platform_id or channel.screenname)
|
||||
|
||||
def url_to_key(self, url: str, content_type: str) -> str:
|
||||
@@ -91,7 +96,7 @@ class TwitterScraper(Scraper):
|
||||
key = parsed_url.path.split('/')[-1] + ext
|
||||
return key
|
||||
|
||||
def get_profile(self, channel: Channel) -> dict:
|
||||
def get_profile(self, channel: Channel) -> RawChannelInfo:
|
||||
|
||||
scraper = TwitterUserScraper(channel.screenname)
|
||||
entity = scraper._get_entity()
|
||||
@@ -99,4 +104,8 @@ class TwitterScraper(Scraper):
|
||||
if entity is None:
|
||||
raise ChannelDoesNotExistError(channel.url)
|
||||
else:
|
||||
return entity.__dict__
|
||||
return RawChannelInfo(scraper=self.__version__,
|
||||
platform=channel.platform,
|
||||
channel=channel.id,
|
||||
raw_data=json.dumps(entity.__dict__, default=str),
|
||||
date_archived=datetime.now(timezone.utc))
|
||||
|
||||
@@ -1,11 +1,10 @@
|
||||
from datetime import datetime, timezone
|
||||
from typing import Generator
|
||||
from urllib.parse import urlparse
|
||||
|
||||
from snscrape.modules.vkontakte import VKontakteUserScraper
|
||||
from loguru import logger
|
||||
|
||||
from cisticola.base import Channel, ScraperResult
|
||||
from cisticola.base import Channel, ScraperResult, RawChannelInfo
|
||||
from cisticola.scraper.base import Scraper
|
||||
|
||||
class VkontakteScraper(Scraper):
|
||||
@@ -62,7 +61,7 @@ class VkontakteScraper(Scraper):
|
||||
platform_id=post.url.split('/')[-1],
|
||||
date=datetime.fromordinal(post.date.toordinal()).replace(tzinfo=timezone.utc),
|
||||
date_archived=datetime.now(timezone.utc),
|
||||
raw_data=post.json(),
|
||||
raw_posts=post.json(),
|
||||
archived_urls=archived_urls,
|
||||
media_archived=archive_media)
|
||||
|
||||
@@ -80,10 +79,15 @@ class VkontakteScraper(Scraper):
|
||||
|
||||
return key
|
||||
|
||||
def get_profile(self, channel: Channel) -> RawChannelInfo:
    """Fetch the VKontakte user entity via snscrape and wrap it as ``RawChannelInfo``.

    The username is derived from ``channel.url``. The entity's ``__dict__``
    is stored JSON-encoded in ``raw_data``, together with this scraper's
    version, the channel's platform and id, and the current UTC time as
    ``date_archived``.
    """
    # The module header shown for this file does not import json, so it is
    # imported here to keep the call below from raising NameError.
    import json

    username = self.get_username_from_url(channel.url)
    scraper = VKontakteUserScraper(username)

    profile = scraper._get_entity().__dict__

    return RawChannelInfo(scraper=self.__version__,
        platform=channel.platform,
        channel=channel.id,
        # default=str matches how the Twitter scraper serialises an entity
        # __dict__; it only affects values json cannot encode natively.
        raw_data=json.dumps(profile, default=str),
        date_archived=datetime.now(timezone.utc))
|
||||
|
||||
@@ -2,10 +2,9 @@ from datetime import datetime, timezone
|
||||
import json
|
||||
from typing import Generator
|
||||
import tempfile
|
||||
|
||||
import yt_dlp
|
||||
|
||||
from cisticola.base import Channel, ScraperResult
|
||||
from cisticola.base import Channel, ScraperResult, RawChannelInfo
|
||||
from cisticola.scraper import Scraper
|
||||
|
||||
class YoutubeScraper(Scraper):
|
||||
@@ -71,7 +70,7 @@ class YoutubeScraper(Scraper):
|
||||
platform_id=video_id,
|
||||
date=datetime.strptime(video['upload_date'], '%Y%m%d').replace(tzinfo=timezone.utc),
|
||||
date_archived=datetime.now(timezone.utc),
|
||||
raw_data=json.dumps(video, default = str),
|
||||
raw_posts=json.dumps(video, default = str),
|
||||
archived_urls=archived_urls,
|
||||
media_archived=archive_media)
|
||||
|
||||
@@ -79,8 +78,7 @@ class YoutubeScraper(Scraper):
|
||||
if channel.platform == "Youtube" and channel.url:
|
||||
return True
|
||||
|
||||
def get_profile(self, channel: Channel) -> dict:
|
||||
|
||||
def get_profile(self, channel: Channel) -> RawChannelInfo:
|
||||
ydl_opts = {}
|
||||
ydl = yt_dlp.YoutubeDL(ydl_opts)
|
||||
|
||||
@@ -89,7 +87,12 @@ class YoutubeScraper(Scraper):
|
||||
meta = ydl.extract_info(
|
||||
channel.url,
|
||||
process=False)
|
||||
|
||||
return RawChannelInfo(scraper=self.__version__,
|
||||
platform=channel.platform,
|
||||
channel=channel.id,
|
||||
raw_data=json.dumps(meta),
|
||||
date_archived=datetime.now(timezone.utc))
|
||||
|
||||
except yt_dlp.utils.DownloadError as e:
|
||||
raise e
|
||||
|
||||
return meta
|
||||
@@ -20,7 +20,7 @@ class BitchuteTransformer(Transformer):
|
||||
return False
|
||||
|
||||
def transform_media(self, data: ScraperResult, transformed: Post) -> Generator[Media, None, None]:
|
||||
raw = json.loads(data.raw_data)
|
||||
raw = json.loads(data.raw_posts)
|
||||
|
||||
orig = raw['video_url']
|
||||
new = data.archived_urls[orig]
|
||||
@@ -30,7 +30,7 @@ class BitchuteTransformer(Transformer):
|
||||
yield m
|
||||
|
||||
def transform(self, data: ScraperResult) -> Post:
|
||||
raw = json.loads(data.raw_data)
|
||||
raw = json.loads(data.raw_posts)
|
||||
|
||||
soup = BeautifulSoup(raw['body'], features = 'html.parser')
|
||||
content = soup.find_all('p')[-1].text
|
||||
|
||||
@@ -47,7 +47,7 @@ class TwitterTransformer(Transformer):
|
||||
|
||||
|
||||
def transform(self, data: ScraperResult, insert: Callable) -> Generator[Union[Post, Channel, Media], None, None]:
|
||||
raw = json.loads(data.raw_data)
|
||||
raw = json.loads(data.raw_posts)
|
||||
|
||||
transformed = Post(
|
||||
raw_id=data.id,
|
||||
|
||||
74
test.py
74
test.py
@@ -1,74 +0,0 @@
|
||||
from sqlalchemy import create_engine
|
||||
from loguru import logger
|
||||
import gspread
|
||||
from sqlalchemy import create_engine
|
||||
from sqlalchemy.orm import sessionmaker
|
||||
|
||||
from cisticola.base import Channel, Post, ScraperResult, mapper_registry
|
||||
from cisticola.scraper import (
|
||||
ScraperController,
|
||||
BitchuteScraper,
|
||||
GabScraper,
|
||||
GettrScraper,
|
||||
OdyseeScraper,
|
||||
RumbleScraper,
|
||||
TelegramSnscrapeScraper,
|
||||
TelegramTelethonScraper,
|
||||
TwitterScraper)
|
||||
from cisticola.transformer import ETLController
|
||||
from cisticola.transformer.twitter import TwitterTransformer
|
||||
|
||||
logger.add("../test.log")
|
||||
|
||||
controller = ScraperController()
|
||||
|
||||
scrapers = [
|
||||
BitchuteScraper(),
|
||||
GabScraper(),
|
||||
GettrScraper(),
|
||||
OdyseeScraper(),
|
||||
RumbleScraper(),
|
||||
TelegramTelethonScraper(),
|
||||
TwitterScraper()]
|
||||
|
||||
controller.register_scrapers(scrapers)
|
||||
|
||||
engine = create_engine('sqlite:///test.db')
|
||||
mapper_registry.metadata.create_all(bind=engine)
|
||||
session_generator = sessionmaker()
|
||||
session_generator.configure(bind=engine)
|
||||
session = session_generator()
|
||||
|
||||
gc = gspread.service_account(filename='service_account.json')
|
||||
|
||||
# Open a sheet from a spreadsheet in one go
|
||||
wks = gc.open_by_url("https://docs.google.com/spreadsheets/d/1k5VgqREoA3v1r7bkVq7TOTRDtdYqTMWkQnsZpRbntpw/edit#gid=0")
|
||||
channels = wks.worksheet("channels").get_all_records()
|
||||
|
||||
for c in channels:
|
||||
del c['followers']
|
||||
|
||||
for k in c.keys():
|
||||
if c[k] == 'TRUE' or c[k] == 'yes': c[k] = True
|
||||
if c[k] == 'FALSE' or c[k] == 'no': c[k] = False
|
||||
|
||||
# check to see if this already exists,
|
||||
channel = session.query(Channel).filter_by(platform_id=c['platform_id'], platform=c['platform']).first()
|
||||
|
||||
if not channel:
|
||||
channel = Channel(**c, source='researcher')
|
||||
session.add(channel)
|
||||
|
||||
session.commit()
|
||||
|
||||
controller.connect_to_db(engine)
|
||||
controller.scrape_all_channels(archive_media = False)
|
||||
|
||||
controller.archive_unarchived_media()
|
||||
|
||||
# transformer = TwitterTransformer()
|
||||
|
||||
# etl_controller = ETLController()
|
||||
# etl_controller.register_transformer(transformer)
|
||||
# etl_controller.connect_to_db(engine)
|
||||
# etl_controller.transform_all_untransformed()
|
||||
@@ -4,6 +4,7 @@ from cisticola.base import Channel
|
||||
from cisticola.scraper import TelegramTelethonScraper
|
||||
|
||||
def test_scrape_telegram_telethon_channel_no_media(controller, channel_kwargs):
|
||||
controller.remove_all_scrapers()
|
||||
|
||||
channels = [Channel(**channel_kwargs['telegram'])]
|
||||
controller.register_scraper(scraper = TelegramTelethonScraper())
|
||||
@@ -13,6 +14,7 @@ def test_scrape_telegram_telethon_channel_no_media(controller, channel_kwargs):
|
||||
def test_scrape_telegram_telethon_channel(controller, channel_kwargs):
|
||||
|
||||
controller.reset_db()
|
||||
controller.remove_all_scrapers()
|
||||
|
||||
channels = [Channel(**channel_kwargs['telegram'])]
|
||||
controller.register_scraper(scraper = TelegramTelethonScraper())
|
||||
|
||||
Reference in New Issue
Block a user