From ffe1c425a01cbc8d087da2ed2b87077df1acd74f Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Mon, 27 Jun 2022 01:07:55 +0200 Subject: [PATCH 1/4] new archiver, new hack, ready --- Pipfile | 1 + Pipfile.lock | 224 +++++++++++++++++++----------- README.md | 1 + archivers/__init__.py | 3 +- archivers/base_archiver.py | 6 +- archivers/telethon_archiver.py | 3 +- archivers/twitter_api_archiver.py | 73 ++++++++++ archivers/twitter_archiver.py | 69 ++++++--- auto_archive.py | 3 +- configs/__init__.py | 1 + configs/config.py | 20 ++- configs/twitter_api_config.py | 11 ++ example.config.yaml | 17 ++- 13 files changed, 325 insertions(+), 107 deletions(-) create mode 100644 archivers/twitter_api_archiver.py create mode 100644 configs/twitter_api_config.py diff --git a/Pipfile b/Pipfile index a14995b..88fad6a 100644 --- a/Pipfile +++ b/Pipfile @@ -24,6 +24,7 @@ python-slugify = "*" pyyaml = "*" dateparser = "*" vk-url-scraper = "*" +python-twitter-v2 = "*" [requires] python_version = "3.9" diff --git a/Pipfile.lock b/Pipfile.lock index 77fc8f0..691fcfa 100644 --- a/Pipfile.lock +++ b/Pipfile.lock @@ -1,7 +1,7 @@ { "_meta": { "hash": { - "sha256": "eacd9633c33d4d526d7737fc6bf83ab713205f28f819530f549378fbd14da3d8" + "sha256": "1ed953d08e31d891de0f887e520f12025d109a20718b27dd8f9b361f73c95651" }, "pipfile-spec": 6, "requires": { @@ -29,7 +29,7 @@ "sha256:01c7bf666359b4967d2cda0000cc2e4af16a0ae098cbffcb8472fb9e8ad6585b", "sha256:6ebb3d106c12920aaae42ccb6f787ef5eefdcdd166ea3d628fa8476abe712144" ], - "markers": "python_version >= '3.5'", + "markers": "python_full_version >= '3.5.0'", "version": "==1.10" }, "attrs": { @@ -40,6 +40,13 @@ "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'", "version": "==21.4.0" }, + "authlib": { + "hashes": [ + "sha256:b83cf6360c8e92b0e9df0d1f32d675790bcc4e3c03977499b1eed24dcdef4252", + "sha256:ecf4a7a9f2508c0bb07e93a752dd3c495cfaffc20e864ef0ffc95e3f40d2abaf" + ], + "version": "==0.15.5" + }, "beautifulsoup4": { "hashes": [ "sha256:58d5c3d29f5a36ffeb94f02f0d786cd53014cf9b3b3951d42e0080d8a9498d30", @@ -50,19 +57,19 @@ }, "boto3": { "hashes": [ - "sha256:13efff22f1cb6d25ec7027edaccdfdd515ba593e093173beb09094cff898a8cc", - "sha256:945d49941541a3cbb02710361be64b22f98e68c2e447229f0d51f7c215009e28" + "sha256:a547880008f0031834fe0122e91cc064438f54d15b9c34729672c53203a0c740", + "sha256:bcbf31eff02bc01f9c55e2d428b4f6a27701c86b4600cbe4e9d45aa1dd61f036" ], "index": "pypi", - "version": "==1.24.13" + "version": "==1.24.17" }, "botocore": { "hashes": [ - "sha256:df75e53576b061818bbce4bd70221749e40cc91d16a2b6c03fbeec8023665734", - "sha256:fbc09558c02d415e8646520f95db7e8d313460938780fa6040b00865f098fd55" + "sha256:af9d44592b4d0d6509b355b2ec5cb14fd23eadf7c33d13b880266dede22759ac", + "sha256:baf60b803ffd7b1dbc9c93dd2049fe2372699e4c993c9d33713667acdea64d1f" ], "markers": "python_version >= '3.7'", - "version": "==1.27.13" + "version": "==1.27.17" }, "brotli": { "hashes": [ @@ -215,7 +222,7 @@ "sha256:2857e29ff0d34db842cd7ca3230549d1a697f96ee6d3fb071cfa6c7393832597", "sha256:6881edbebdb17b39b4eaaa821b438bf6eddffb4468cf344f09f89def34a8b1df" ], - "markers": "python_version >= '3.5'", + "markers": "python_full_version >= '3.5.0'", "version": "==2.0.12" }, "click": { @@ -265,8 +272,17 @@ "sha256:f224ad253cc9cea7568f49077007d2263efa57396a2f2f78114066fd54b5c68e", "sha256:f8ec91983e638a9bcd75b39f1396e5c0dc2330cbd9ce4accefe68717e6779e0a" ], + "markers": "python_version >= '3.6'", "version": "==37.0.2" }, + "dataclasses-json": { + "hashes": [ + "sha256:bc285b5f892094c3a53d558858a88553dd6a61a11ab1a8128a0e554385dcc5dd", + "sha256:c2c11bc8214fbf709ffc369d11446ff6945254a7f09128154a7620613d8fda90" + ], + "markers": "python_version >= '3.6'", + "version": "==0.5.7" + }, "dateparser": { "hashes": [ "sha256:038196b1f12c7397e38aad3d61588833257f6f552baa63a1499e6987fa8d42d9", @@ -348,11 +364,11 @@ }, "googleapis-common-protos": { "hashes": [ - "sha256:023eaea9d8c1cceccd9587c6af6c20f33eeeb05d4148670f2b0322dc1511700c", - "sha256:b09b56f5463070c2153753ef123f07d2e49235e89148e9b2459ec8ed2f68d7d3" + "sha256:6f1369b58ed6cf3a4b7054a44ebe8d03b29c309257583a2bbdc064cd1e4a1442", + "sha256:87955d7b3a73e6e803f2572a33179de23989ebba725e05ea42f24838b792e461" ], "markers": "python_version >= '3.6'", - "version": "==1.56.2" + "version": "==1.56.3" }, "gspread": { "hashes": [ @@ -383,16 +399,16 @@ "sha256:84d9dd047ffa80596e0f246e2eab0b391788b0503584e8945f2368256d2735ff", "sha256:9d643ff0a55b762d5cdb124b8eaa99c66322e2157b69160bc32796e824360e6d" ], - "markers": "python_version >= '3.5'", + "markers": "python_full_version >= '3.5.0'", "version": "==3.3" }, "importlib-metadata": { "hashes": [ - "sha256:5d26852efe48c0a32b0509ffbc583fda1a2266545a78d104a6f4aff3db17d700", - "sha256:c58c8eb8a762858f49e18436ff552e83914778e50e9d2f1660535ffb364552ec" + "sha256:637245b8bab2b6502fcbc752cc4b7a6f6243bb02b31c5c26156ad103d3d45670", + "sha256:7401a975809ea1fdc658c3aa4f78cc2195a0e019c5cbc4c06122884e9ae80c23" ], "markers": "python_version < '3.10'", - "version": "==4.11.4" + "version": "==4.12.0" }, "itsdangerous": { "hashes": [ @@ -541,14 +557,36 @@ "markers": "python_version >= '3.7'", "version": "==2.1.1" }, + "marshmallow": { + "hashes": [ + "sha256:53a1e0ee69f79e1f3e80d17393b25cfc917eda52f859e8183b4af72c3390c1f1", + "sha256:a762c1d8b2bcb0e5c8e964850d03f9f3bffd6a12b626f3c14b9d6b1841999af5" + ], + "markers": "python_version >= '3.7'", + "version": "==3.16.0" + }, + "marshmallow-enum": { + "hashes": [ + "sha256:38e697e11f45a8e64b4a1e664000897c659b60aa57bfa18d44e226a9920b6e58", + "sha256:57161ab3dbfde4f57adeb12090f39592e992b9c86d206d02f6bd03ebec60f072" + ], + "version": "==1.5.1" + }, "mutagen": { "hashes": [ "sha256:6397602efb3c2d7baebd2166ed85731ae1c1d475abca22090b7141ff5034b3e1", "sha256:9c9f243fcec7f410f138cb12c21c84c64fde4195481a30c9bfb05b5f003adfed" ], - "markers": "python_version >= '3.5' and python_version < '4'", + "markers": "python_version < '4' and python_full_version >= '3.5.0'", "version": "==1.45.1" }, + "mypy-extensions": { + "hashes": [ + "sha256:090fedd75945a69ae91ce1303b5824f428daf5a028d2f6ab8a299250a846f15d", + "sha256:2d82818f5bb3e369420cb3c4060a7970edba416647068eb4c5343488a6c604a8" + ], + "version": "==0.4.3" + }, "oauth2client": { "hashes": [ "sha256:b8a81cc5d60e2d364f0b1b98f958dbd472887acaf1a5b05e21c28c31a2d6d3ac", @@ -573,35 +611,33 @@ "markers": "python_version >= '3.7'", "version": "==1.2.0" }, + "packaging": { + "hashes": [ + "sha256:dd47c42927d89ab911e606518907cc2d3a1f38bbd026385970643f9c5b8ecfeb", + "sha256:ef103e05f519cdc783ae24ea4e2e0f508a9c99b2d4969652eed6a2e1ea5bd522" + ], + "markers": "python_version >= '3.6'", + "version": "==21.3" + }, "protobuf": { "hashes": [ - "sha256:06059eb6953ff01e56a25cd02cca1a9649a75a7e65397b5b9b4e929ed71d10cf", - "sha256:097c5d8a9808302fb0da7e20edf0b8d4703274d140fd25c5edabddcde43e081f", - "sha256:284f86a6207c897542d7e956eb243a36bb8f9564c1742b253462386e96c6b78f", - "sha256:32ca378605b41fd180dfe4e14d3226386d8d1b002ab31c969c366549e66a2bb7", - "sha256:3cc797c9d15d7689ed507b165cd05913acb992d78b379f6014e013f9ecb20996", - "sha256:62f1b5c4cd6c5402b4e2d63804ba49a327e0c386c99b1675c8a0fefda23b2067", - "sha256:69ccfdf3657ba59569c64295b7d51325f91af586f8d5793b734260dfe2e94e2c", - "sha256:6f50601512a3d23625d8a85b1638d914a0970f17920ff39cec63aaef80a93fb7", - "sha256:7403941f6d0992d40161aa8bb23e12575637008a5a02283a930addc0508982f9", - "sha256:755f3aee41354ae395e104d62119cb223339a8f3276a0cd009ffabfcdd46bb0c", - "sha256:77053d28427a29987ca9caf7b72ccafee011257561259faba8dd308fda9a8739", - "sha256:7e371f10abe57cee5021797126c93479f59fccc9693dafd6bd5633ab67808a91", - "sha256:9016d01c91e8e625141d24ec1b20fed584703e527d28512aa8c8707f105a683c", - "sha256:9be73ad47579abc26c12024239d3540e6b765182a91dbc88e23658ab71767153", - "sha256:adc31566d027f45efe3f44eeb5b1f329da43891634d61c75a5944e9be6dd42c9", - "sha256:adfc6cf69c7f8c50fd24c793964eef18f0ac321315439d94945820612849c388", - "sha256:af0ebadc74e281a517141daad9d0f2c5d93ab78e9d455113719a45a49da9db4e", - "sha256:cb29edb9eab15742d791e1025dd7b6a8f6fcb53802ad2f6e3adcb102051063ab", - "sha256:cd68be2559e2a3b84f517fb029ee611546f7812b1fdd0aa2ecc9bc6ec0e4fdde", - "sha256:cdee09140e1cd184ba9324ec1df410e7147242b94b5f8b0c64fc89e38a8ba531", - "sha256:db977c4ca738dd9ce508557d4fce0f5aebd105e158c725beec86feb1f6bc20d8", - "sha256:dd5789b2948ca702c17027c84c2accb552fc30f4622a98ab5c51fcfe8c50d3e7", - "sha256:e250a42f15bf9d5b09fe1b293bdba2801cd520a9f5ea2d7fb7536d4441811d20", - "sha256:ff8d8fa42675249bb456f5db06c00de6c2f4c27a065955917b28c4f15978b9c3" + "sha256:095fda15fe04a79c9f0edab09b424be46dd057b15986d235b84c8cea91659df7", + "sha256:29eaf8e9db33bc3bae14576ad61370aa2b64ea5d6e6cd705042692e5e0404b10", + "sha256:4758b9c22ad0486639a68cea58d38571f233019a73212d78476ec648f68a49a3", + "sha256:57a593e40257ab4f164fe6e171651b1386c98f8ec5f5a8643642889c50d4f3c4", + "sha256:5f8c7488e74024fa12b46aab4258f707d7d6e94c8d322d7c45cc13770f66ab59", + "sha256:7b2dcca25d88ec77358eed3d031c8260b5bf3023fff03a31c9584591c5910833", + "sha256:853708afc3a7eed4df28a8d4bd4812f829f8d736c104dd8d584ccff27969e311", + "sha256:863f65e137d9de4a76cac39ae731a19bea1c30997f512ecf0dc9348112313401", + "sha256:9b42afb67e19010cdda057e439574ccd944902ea14b0d52ba0bfba2aad50858d", + "sha256:b82ac05b0651a4d2b9d56f5aeef3d711f5858eb4b71c13d77553739e5930a74a", + "sha256:d622dc75e289e8b3031dd8b4e87df508f11a6b3d86a49fb50256af7ce030d35b", + "sha256:e3d3df3292ab4bae85213b9ebef566b5aedb45f97425a92fac5b2e431d31e71c", + "sha256:ef0768a609a02b2b412fa0f59f1242f1597e9bb15188d043f3fde09115ca6c69", + "sha256:f2f43ae8dff452aee3026b59ea0a09245ab2529a55a0984992e76bcf848610e1" ], "markers": "python_version >= '3.7'", - "version": "==3.20.1" + "version": "==4.21.2" }, "pyaes": { "hashes": [ @@ -654,36 +690,39 @@ }, "pycryptodomex": { "hashes": [ - "sha256:1ca8e1b4c62038bb2da55451385246f51f412c5f5eabd64812c01766a5989b4a", - "sha256:298c00ea41a81a491d5b244d295d18369e5aac4b61b77b2de5b249ca61cd6659", - "sha256:2aa887683eee493e015545bd69d3d21ac8d5ad582674ec98f4af84511e353e45", - "sha256:2ce76ed0081fd6ac8c74edc75b9d14eca2064173af79843c24fa62573263c1f2", - "sha256:3da13c2535b7aea94cc2a6d1b1b37746814c74b6e80790daddd55ca5c120a489", - "sha256:406ec8cfe0c098fadb18d597dc2ee6de4428d640c0ccafa453f3d9b2e58d29e2", - "sha256:4d0db8df9ffae36f416897ad184608d9d7a8c2b46c4612c6bc759b26c073f750", - "sha256:530756d2faa40af4c1f74123e1d889bd07feae45bac2fd32f259a35f7aa74151", - "sha256:77931df40bb5ce5e13f4de2bfc982b2ddc0198971fbd947776c8bb5050896eb2", - "sha256:797a36bd1f69df9e2798e33edb4bd04e5a30478efc08f9428c087f17f65a7045", - "sha256:8085bd0ad2034352eee4d4f3e2da985c2749cb7344b939f4d95ead38c2520859", - "sha256:8536bc08d130cae6dcba1ea689f2913dfd332d06113904d171f2f56da6228e89", - "sha256:a4d412eba5679ede84b41dbe48b1bed8f33131ab9db06c238a235334733acc5e", - "sha256:aebecde2adc4a6847094d3bd6a8a9538ef3438a5ea84ac1983fcb167db614461", - "sha256:b276cc4deb4a80f9dfd47a41ebb464b1fe91efd8b1b8620cf5ccf8b824b850d6", - "sha256:b5a185ae79f899b01ca49f365bdf15a45d78d9856f09b0de1a41b92afce1a07f", - "sha256:c4d8977ccda886d88dc3ca789de2f1adc714df912ff3934b3d0a3f3d777deafb", - "sha256:c5dd3ffa663c982d7f1be9eb494a8924f6d40e2e2f7d1d27384cfab1b2ac0662", - "sha256:ca88f2f7020002638276439a01ffbb0355634907d1aa5ca91f3dc0c2e44e8f3b", - "sha256:d2cce1c82a7845d7e2e8a0956c6b7ed3f1661c9acf18eb120fc71e098ab5c6fe", - "sha256:d709572d64825d8d59ea112e11cc7faf6007f294e9951324b7574af4251e4de8", - "sha256:da8db8374295fb532b4b0c467e66800ef17d100e4d5faa2bbbd6df35502da125", - "sha256:e36c7e3b5382cd5669cf199c4a04a0279a43b2a3bdd77627e9b89778ac9ec08c", - "sha256:e95a4a6c54d27a84a4624d2af8bb9ee178111604653194ca6880c98dcad92f48", - "sha256:ee835def05622e0c8b1435a906491760a43d0c462f065ec9143ec4b8d79f8bff", - "sha256:f75009715dcf4a3d680c2338ab19dac5498f8121173a929872950f4fb3a48fbf", - "sha256:f8524b8bc89470cec7ac51734907818d3620fb1637f8f8b542d650ebec42a126" + "sha256:04cc393045a8f19dd110c975e30f38ed7ab3faf21ede415ea67afebd95a22380", + "sha256:0776bfaf2c48154ab54ea45392847c1283d2fcf64e232e85565f858baedfc1fa", + "sha256:0fadb9f7fa3150577800eef35f62a8a24b9ddf1563ff060d9bd3af22d3952c8c", + "sha256:18e2ab4813883ae63396c0ffe50b13554b32bb69ec56f0afaf052e7a7ae0d55b", + "sha256:191e73bc84a8064ad1874dba0ebadedd7cce4dedee998549518f2c74a003b2e1", + "sha256:35a8f7afe1867118330e2e0e0bf759c409e28557fb1fc2fbb1c6c937297dbe9a", + "sha256:3709f13ca3852b0b07fc04a2c03b379189232b24007c466be0f605dd4723e9d4", + "sha256:4540904c09704b6f831059c0dfb38584acb82cb97b0125cd52688c1f1e3fffa6", + "sha256:463119d7d22d0fc04a0f9122e9d3e6121c6648bcb12a052b51bd1eed1b996aa2", + "sha256:46b3f05f2f7ac7841053da4e0f69616929ca3c42f238c405f6c3df7759ad2780", + "sha256:48697790203909fab02a33226fda546604f4e2653f9d47bc5d3eb40879fa7c64", + "sha256:5676a132169a1c1a3712edf25250722ebc8c9102aa9abd814df063ca8362454f", + "sha256:65204412d0c6a8e3c41e21e93a5e6054a74fea501afa03046a388cf042e3377a", + "sha256:67e1e6a92151023ccdfcfbc0afb3314ad30080793b4c27956ea06ab1fb9bcd8a", + "sha256:6f5b6ba8aefd624834bc177a2ac292734996bb030f9d1b388e7504103b6fcddf", + "sha256:7341f1bb2dadb0d1a0047f34c3a58208a92423cdbd3244d998e4b28df5eac0ed", + "sha256:78d9621cf0ea35abf2d38fa2ca6d0634eab6c991a78373498ab149953787e5e5", + "sha256:8eecdf9cdc7343001d047f951b9cc805cd68cb6cd77b20ea46af5bffc5bd3dfb", + "sha256:94c7b60e1f52e1a87715571327baea0733708ab4723346598beca4a3b6879794", + "sha256:996e1ba717077ce1e6d4849af7a1426f38b07b3d173b879e27d5e26d2e958beb", + "sha256:a07a64709e366c2041cd5cfbca592b43998bf4df88f7b0ca73dca37071ccf1bd", + "sha256:b6306403228edde6e289f626a3908a2f7f67c344e712cf7c0a508bab3ad9e381", + "sha256:b9279adc16e4b0f590ceff581f53a80179b02cba9056010d733eb4196134a870", + "sha256:c4cb9cb492ea7dcdf222a8d19a1d09002798ea516aeae8877245206d27326d86", + "sha256:dd452a5af7014e866206d41751886c9b4bf379a339fdf2dbfc7dd16c0fb4f8e0", + "sha256:e2b12968522a0358b8917fc7b28865acac002f02f4c4c6020fcb264d76bfd06d", + "sha256:e3164a18348bd53c69b4435ebfb4ac8a4076291ffa2a70b54f0c4b80c7834b1d", + "sha256:e47bf8776a7e15576887f04314f5228c6527b99946e6638cf2f16da56d260cab", + "sha256:f8be976cec59b11f011f790b88aca67b4ea2bd286578d0bd3e31bcd19afcd3e4", + "sha256:fc9bc7a9b79fe5c750fc81a307052f8daabb709bdaabb0fb18fb136b66b653b5" ], "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'", - "version": "==3.14.1" + "version": "==3.15.0" }, "pygments": { "hashes": [ @@ -729,7 +768,7 @@ "sha256:b7e3b04a59693c42c36f9ab1cc2acc46fa5df8c78e178fc33a8d4cd05c8d498f", "sha256:d92a187be61fe482e4fd675b6d52200e7be63a12b724abbf931a40ce4fa92938" ], - "markers": "python_version >= '3.5'", + "markers": "python_full_version >= '3.5.0'", "version": "==0.20.0" }, "python-slugify": { @@ -740,6 +779,14 @@ "index": "pypi", "version": "==6.1.2" }, + "python-twitter-v2": { + "hashes": [ + "sha256:0b6ab9abff4bc447ece4a2cc2439bc8776d306a3415a73d89013436e9a77894d", + "sha256:f9fabdb2b34e7c49e9014e3acfd52ae5199248e8948567033fb4b73b927cfb0d" + ], + "index": "pypi", + "version": "==0.7.7" + }, "pytz": { "hashes": [ "sha256:1e760e2fe6a8163bc0b3d9a19c4f84342afa0a2affebfaa84b01b978a02ecaa7", @@ -923,10 +970,10 @@ }, "selenium": { "hashes": [ - "sha256:ba5b2633f43cf6fe9d308fa4a6996e00a101ab9cb1aad6fd91ae1f3dbe57f56f" + "sha256:f67402b8f973aaa98d9c55b8f9aa63532009cd1859b2222a8b9800354942d8bc" ], "index": "pypi", - "version": "==4.2.0" + "version": "==4.3.0" }, "six": { "hashes": [ @@ -941,7 +988,7 @@ "sha256:471b71698eac1c2112a40ce2752bb2f4a4814c22a54a3eed3676bc0f5ca9f663", "sha256:c4666eecec1d3f50960c6bdf61ab7bc350648da6c126e3cf6898d8cd4ddcd3de" ], - "markers": "python_version >= '3.5'", + "markers": "python_full_version >= '3.5.0'", "version": "==1.2.0" }, "snscrape": { @@ -1010,9 +1057,25 @@ "sha256:5b558f6e83cc20a37c3b61202476c5295d1addf57bd65543364e0337e37ed2bc", "sha256:a3d34de8fac26023eee701ed1e7bf4da9a8326b61a62934ec9e53b64970fd8fe" ], - "markers": "python_version >= '3.5'", + "markers": "python_full_version >= '3.5.0'", "version": "==0.9.2" }, + "typing-extensions": { + "hashes": [ + "sha256:6657594ee297170d19f67d55c05852a874e7eb634f4f753dbd667855e07c1708", + "sha256:f1c24655a0da0d1b67f07e17a5e6b2a105894e6824b92096378bb3668ef02376" + ], + "markers": "python_version >= '3.7'", + "version": "==4.2.0" + }, + "typing-inspect": { + "hashes": [ + "sha256:047d4097d9b17f46531bf6f014356111a1b6fb821a24fe7ac909853ca2a782aa", + "sha256:3cd7d4563e997719a710a3bfe7ffb544c6b72069b6812a02e9b414a8fa3aaa6b", + "sha256:b1f56c0783ef0f25fb064a01be6e5407e54cf4a4bf4f3ba3fe51e0bd6dcea9e5" + ], + "version": "==0.7.1" + }, "tzdata": { "hashes": [ "sha256:238e70234214138ed7b4e8a0fab0e5e13872edab3be586ab8198c407620e2ab9", @@ -1038,7 +1101,10 @@ "version": "==4.1.1" }, "urllib3": { - "extras": [], + "extras": [ + "secure", + "socks" + ], "hashes": [ "sha256:44ece4d53fb1706f667c9bd1c648f5469a2ec925fcf3a776667042d645472c14", "sha256:aabaf16477806a5e1dd19aa41f8c2b7950dd3c746362d7e3223dbe6de6ac448e" @@ -1055,11 +1121,11 @@ }, "vk-url-scraper": { "hashes": [ - "sha256:181c8a4b69e395a68bdf00e3dc1717e5218960c9fda6e90eea9633ff26fc9257", - "sha256:9cfc6bc3d7259f18508c3822955efac21ff9bad5bd886010b10f098ea10ad551" + "sha256:7caf8d788fc268d311b13c06ff0cbd9413dd8978f463af970459b9e7e2f42ba5", + "sha256:c4593d86b5096e75e2845e4838f46ce2cf0ac34b2fe1c4476d2eeb6744b18a11" ], "index": "pypi", - "version": "==0.3.2" + "version": "==0.3.5" }, "websockets": { "hashes": [ diff --git a/README.md b/README.md index ddbe457..7604d83 100644 --- a/README.md +++ b/README.md @@ -160,6 +160,7 @@ graph TD A -->|parent of| F(TwitterArchiver) A -->|parent of| G(VkArchiver) A -->|parent of| H(WaybackArchiver) + F -->|parent of| I(TwitterApiArchiver) ``` ### Current Storages ```mermaid diff --git a/archivers/__init__.py b/archivers/__init__.py index 33700d1..403ebea 100644 --- a/archivers/__init__.py +++ b/archivers/__init__.py @@ -6,4 +6,5 @@ from .tiktok_archiver import TiktokArchiver from .wayback_archiver import WaybackArchiver from .youtubedl_archiver import YoutubeDLArchiver from .twitter_archiver import TwitterArchiver -from .vk_archiver import VkArchiver \ No newline at end of file +from .vk_archiver import VkArchiver +from .twitter_api_archiver import TwitterApiArchiver \ No newline at end of file diff --git a/archivers/base_archiver.py b/archivers/base_archiver.py index 4a06475..70be680 100644 --- a/archivers/base_archiver.py +++ b/archivers/base_archiver.py @@ -149,9 +149,13 @@ class Archiver(ABC): if a string is passed in @with_extension the slug is appended with it if there is no "." in the slug if @append_date is true, the key adds a timestamp after the URL slug and before the extension """ - slug = slugify(urlparse(url).path) + url_path = urlparse(url).path + path, ext = os.path.splitext(url_path) + slug = slugify(path) if append_datetime: slug += "-" + slugify(datetime.datetime.utcnow().isoformat()) + if len(ext): + slug += ext if with_extension is not None: if "." not in slug: slug += with_extension diff --git a/archivers/telethon_archiver.py b/archivers/telethon_archiver.py index 166996c..c360634 100644 --- a/archivers/telethon_archiver.py +++ b/archivers/telethon_archiver.py @@ -41,7 +41,7 @@ class TelethonArchiver(Archiver): def download(self, url, check_if_exists=False): if not hasattr(self, "client"): - logger.error('Missing Telethon config') + logger.warning('Missing Telethon config') return False # detect URLs that we definitely cannot handle @@ -80,7 +80,6 @@ class TelethonArchiver(Archiver): if check_if_exists and self.storage.exists(key): # only s3 storage supports storage.exists as not implemented on gd cdn_url = self.storage.get_cdn_url(key) - status = 'already archived' return ArchiveResult(status='already archived', cdn_url=cdn_url, title=post.message, timestamp=post.date, screenshot=screenshot) group_id = post.grouped_id if post.grouped_id is not None else post.id diff --git a/archivers/twitter_api_archiver.py b/archivers/twitter_api_archiver.py new file mode 100644 index 0000000..ef2bf40 --- /dev/null +++ b/archivers/twitter_api_archiver.py @@ -0,0 +1,73 @@ + +import json +from datetime import datetime +from loguru import logger +from pytwitter import Api + +from storages.base_storage import Storage +from configs import TwitterApiConfig +from .base_archiver import ArchiveResult +from .twitter_archiver import TwitterArchiver + + +class TwitterApiArchiver(TwitterArchiver): + name = "twitter_api" + + def __init__(self, storage: Storage, driver, config: TwitterApiConfig): + super().__init__(storage, driver) + + if config.bearer_token: + self.api = Api(bearer_token=config.bearer_token) + elif config.consumer_key and config.consumer_secret and config.access_token and config.access_secret: + self.api = Api( + consumer_key=config.consumer_key, consumer_secret=config.consumer_secret, access_token=config.access_token, access_secret=config.access_secret) + + def download(self, url, check_if_exists=False): + if not hasattr(self, "api"): + logger.warning('Missing Twitter API config') + return False + + username, tweet_id = self.get_username_tweet_id(url) + if not username: return False + + tweet = self.api.get_tweet(tweet_id, expansions=["attachments.media_keys"], media_fields=["type", "duration_ms", "url", "variants"], tweet_fields=["attachments", "author_id", "created_at", "entities", "id", "text", "possibly_sensitive"]) + timestamp = datetime.strptime(tweet.data.created_at, "%Y-%m-%dT%H:%M:%S.%fZ") + + # check if exists + key = self.get_html_key(url) + if check_if_exists and self.storage.exists(key): + # only s3 storage supports storage.exists as not implemented on gd + cdn_url = self.storage.get_cdn_url(key) + screenshot = self.get_screenshot(url) + return ArchiveResult(status='already archived', cdn_url=cdn_url, title=tweet.data.text, timestamp=timestamp, screenshot=screenshot) + + urls = [] + if tweet.includes: + for m in tweet.includes.media: + if m.url: + urls.append(m.url) + elif hasattr(m, "variants"): + var_url = self.choose_variant(m.variants) + urls.append(var_url) + else: + urls.append(None) # will trigger error + + for u in urls: + if u is None: + logger.error(f"Should not have gotten None url for {tweet.includes.media=}") + return self.download_alternative(url, tweet_id) + logger.debug(f"found {urls=}") + + output = json.dumps({ + "id": tweet.data.id, + "text": tweet.data.text, + "created_at": tweet.data.created_at, + "author_id": tweet.data.author_id, + "geo": tweet.data.geo, + "lang": tweet.data.lang, + "media": urls + }, ensure_ascii=False, indent=4) + + screenshot = self.get_screenshot(url) + page_cdn, page_hash, thumbnail = self.generate_media_page(urls, url, output) + return ArchiveResult(status="success", cdn_url=page_cdn, screenshot=screenshot, hash=page_hash, thumbnail=thumbnail, timestamp=timestamp, title=tweet.data.text) diff --git a/archivers/twitter_archiver.py b/archivers/twitter_archiver.py index 29c43fe..a874a1d 100644 --- a/archivers/twitter_archiver.py +++ b/archivers/twitter_archiver.py @@ -1,6 +1,5 @@ - -import html -from urllib.parse import urlparse +import html, re, requests +from datetime import datetime from loguru import logger from snscrape.modules.twitter import TwitterTweetScraper, Video, Gif, Photo @@ -9,20 +8,21 @@ from .base_archiver import Archiver, ArchiveResult class TwitterArchiver(Archiver): name = "twitter" + link_pattern = re.compile(r"twitter.com\/(?:\#!\/)?(\w+)\/status(?:es)?\/(\d+)") + + def get_username_tweet_id(self, url): + # detect URLs that we definitely cannot handle + matches = self.link_pattern.findall(url) + if not len(matches): return False, False + + username, tweet_id = matches[0] # only one URL supported + logger.debug(f"Found {username=} and {tweet_id=} in {url=}") + + return username, tweet_id def download(self, url, check_if_exists=False): - - if 'twitter.com' != self.get_netloc(url): - logger.debug(f'{url=} is not from twitter') - return False - - tweet_id = urlparse(url).path.split('/') - if 'status' in tweet_id: - i = tweet_id.index('status') - tweet_id = tweet_id[i + 1] - else: - logger.debug(f'{url=} does not contain "status"') - return False + username, tweet_id = self.get_username_tweet_id(url) + if not username: return False scr = TwitterTweetScraper(tweet_id) @@ -30,7 +30,7 @@ class TwitterArchiver(Archiver): tweet = next(scr.get_items()) except Exception as ex: logger.warning(f"can't get tweet: {type(ex).__name__} occurred. args: {ex.args}") - return False + return self.download_alternative(url, tweet_id) if tweet.media is None: logger.debug(f'No media found, archiving tweet text only') @@ -57,3 +57,40 @@ class TwitterArchiver(Archiver): screenshot = self.get_screenshot(url) return ArchiveResult(status="success", cdn_url=page_cdn, screenshot=screenshot, hash=page_hash, thumbnail=thumbnail, timestamp=tweet.date, title=tweet.content) + + def download_alternative(self, url, tweet_id): + logger.debug(f"Trying twitter hack for {url=}") + hack_url = f"https://cdn.syndication.twimg.com/tweet?id={tweet_id}" + r = requests.get(hack_url) + if r.status_code != 200: return False + tweet = r.json() + + urls = [] + for p in tweet["photos"]: + urls.append(p["url"]) + + # 1 tweet has 1 video max + v = tweet["video"] + urls.append(self.choose_variant(v.get("variants", []))) + + logger.debug(f"Twitter hack got {urls=}") + + timestamp = datetime.strptime(tweet["created_at"], "%Y-%m-%dT%H:%M:%S.%fZ") + screenshot = self.get_screenshot(url) + page_cdn, page_hash, thumbnail = self.generate_media_page(urls, url, r.text) + return ArchiveResult(status="success", cdn_url=page_cdn, screenshot=screenshot, hash=page_hash, thumbnail=thumbnail, timestamp=timestamp, title=tweet["text"]) + + def choose_variant(self, variants): + # choosing the highest quality possible + variant, width, height = None, 0, 0 + for var in variants: + if var["type"] == "video/mp4": + width_height = re.search(r"\/(\d+)x(\d+)\/", var["src"]) + if width_height: + w, h = int(width_height[1]), int(width_height[2]) + if w > width or h > height: + width, height = w, h + variant = var.get("src", variant) + else: + variant = var.get("src") if not variant else variant + return variant diff --git a/auto_archive.py b/auto_archive.py index a90056b..375c5be 100644 --- a/auto_archive.py +++ b/auto_archive.py @@ -3,7 +3,7 @@ import os, datetime, traceback, random, tempfile from loguru import logger from slugify import slugify -from archivers import TelethonArchiver, TelegramArchiver, TiktokArchiver, YoutubeDLArchiver, TwitterArchiver, VkArchiver, WaybackArchiver, ArchiveResult, Archiver +from archivers import TelethonArchiver, TelegramArchiver, TiktokArchiver, YoutubeDLArchiver, TwitterArchiver, TwitterApiArchiver, VkArchiver, WaybackArchiver, ArchiveResult, Archiver from utils import GWorksheet, mkdir_if_not_exists, expand_url from configs import Config from storages import Storage @@ -92,6 +92,7 @@ def process_sheet(c: Config): active_archivers = [ TelethonArchiver(storage, c.webdriver, c.telegram_config), TiktokArchiver(storage, c.webdriver), + TwitterApiArchiver(storage, c.webdriver, c.twitter_config), YoutubeDLArchiver(storage, c.webdriver, c.facebook_cookie), TelegramArchiver(storage, c.webdriver), TwitterArchiver(storage, c.webdriver), diff --git a/configs/__init__.py b/configs/__init__.py index f70c9c6..6940ed3 100644 --- a/configs/__init__.py +++ b/configs/__init__.py @@ -2,4 +2,5 @@ from .config import Config from .selenium_config import SeleniumConfig from .telethon_config import TelethonConfig from .wayback_config import WaybackConfig +from .twitter_api_config import TwitterApiConfig from .vk_config import VkConfig \ No newline at end of file diff --git a/configs/config.py b/configs/config.py index 61928b2..4232651 100644 --- a/configs/config.py +++ b/configs/config.py @@ -11,7 +11,8 @@ from .wayback_config import WaybackConfig from .telethon_config import TelethonConfig from .selenium_config import SeleniumConfig from .vk_config import VkConfig -from storages import Storage, S3Config, S3Storage, GDStorage, GDConfig, LocalStorage, LocalConfig +from .twitter_api_config import TwitterApiConfig +from storages import S3Config, S3Storage, GDStorage, GDConfig, LocalStorage, LocalConfig class Config: @@ -135,6 +136,19 @@ class Config: self.telegram_config = None logger.debug(f"'telegram' key not present in the {self.config_file=}") + # twitter config + if "twitter" in secrets: + self.twitter_config = TwitterApiConfig( + bearer_token=secrets["twitter"].get("bearer_token"), + consumer_key=secrets["twitter"].get("consumer_key"), + consumer_secret=secrets["twitter"].get("consumer_secret"), + access_token=secrets["twitter"].get("access_token"), + access_secret=secrets["twitter"].get("access_secret"), + ) + else: + self.twitter_config = None + logger.debug(f"'twitter' key not present in the {self.config_file=}") + # vk config if "vk" in secrets: self.vk_config = VkConfig( @@ -223,12 +237,11 @@ class Config: self.destroy_webdriver() self.webdriver = new_webdriver self.webdriver.set_window_size(self.selenium_config.window_width, - self.selenium_config.window_height) + self.selenium_config.window_height) self.webdriver.set_page_load_timeout(self.selenium_config.timeout_seconds) except TimeoutException as e: logger.error(f"failed to get new webdriver, possibly due to insufficient system resources or timeout settings: {e}") - def __str__(self) -> str: return json.dumps({ "config_file": self.config_file, @@ -245,6 +258,7 @@ class Config: "local_config": hasattr(self, "local_config"), "wayback_config": self.wayback_config != None, "telegram_config": self.telegram_config != None, + "twitter_config": self.twitter_config != None, "vk_config": self.vk_config != None, "gsheets_client": self.gsheets_client != None, "column_names": self.column_names, diff --git a/configs/twitter_api_config.py b/configs/twitter_api_config.py new file mode 100644 index 0000000..4193111 --- /dev/null +++ b/configs/twitter_api_config.py @@ -0,0 +1,11 @@ + +from dataclasses import dataclass + + +@dataclass +class TwitterApiConfig: + bearer_token: str + consumer_key: str + consumer_secret: str + access_token: str + access_secret: str diff --git a/example.config.yaml b/example.config.yaml index 3381486..cdc0e91 100644 --- a/example.config.yaml +++ b/example.config.yaml @@ -8,7 +8,7 @@ secrets: key: "s3 API key" secret: "s3 API secret" # use region format like such - endpoint_url: 'https://{region}.digitaloceanspaces.com' + endpoint_url: "https://{region}.digitaloceanspaces.com" #use bucket, region, and key (key is the archived file path generated when executing) format like such as: cdn_url: "https://{bucket}.{region}.cdn.digitaloceanspaces.com/{key}" # if private:true S3 urls will not be readable online @@ -24,7 +24,7 @@ secrets: # needed if you use storage=local local: - # local path to save files in + # local path to save files in save_to: "./local_archive" wayback: @@ -34,11 +34,20 @@ secrets: telegram: # to get credentials see: https://telegra.ph/How-to-get-Telegram-APP-ID--API-HASH-05-27 - api_id: your API key, see + api_id: your API key, see api_hash: your API hash # optional, but allows access to more content such as large videos, talk to @botfather bot_token: your bot-token + twitter: + # twitter configuration - API V2 only - either bearer_token only + bearer_token: "" + # OR all of the below + consumer_key: "" + consumer_secret: "" + access_token: "" + access_secret: "" + # vkontakte (vk.com) credentials vk: username: "phone number or email" @@ -49,7 +58,7 @@ secrets: service_account: "service_account.json" facebook: - # optional facebook cookie to have more access to content, from browser, looks like 'cookie: datr= xxxx' + # optional facebook cookie to have more access to content, from browser, looks like 'cookie: datr= xxxx' cookie: "" execution: # can be overwritten with CMD --sheet= From 179528562bda9c3e1d1f4d58650fa7bf6511bb71 Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Mon, 27 Jun 2022 01:07:59 +0200 Subject: [PATCH 2/4] minor updates --- archivers/twitter_archiver.py | 1 + 1 file changed, 1 insertion(+) diff --git a/archivers/twitter_archiver.py b/archivers/twitter_archiver.py index a874a1d..b70d9f3 100644 --- a/archivers/twitter_archiver.py +++ b/archivers/twitter_archiver.py @@ -59,6 +59,7 @@ class TwitterArchiver(Archiver): return ArchiveResult(status="success", cdn_url=page_cdn, screenshot=screenshot, hash=page_hash, thumbnail=thumbnail, timestamp=tweet.date, title=tweet.content) def download_alternative(self, url, tweet_id): + # https://stackoverflow.com/a/71867055/6196010 logger.debug(f"Trying twitter hack for {url=}") hack_url = f"https://cdn.syndication.twimg.com/tweet?id={tweet_id}" r = requests.get(hack_url) From 34536e7f1493ceeaf3629e009027a82a45721415 Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Mon, 27 Jun 2022 11:17:23 +0200 Subject: [PATCH 3/4] added explanation for 2 twitter archivers --- README.md | 3 +++ archivers/twitter_archiver.py | 4 ++++ example.config.yaml | 4 +++- 3 files changed, 10 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 7604d83..ef6836d 100644 --- a/README.md +++ b/README.md @@ -151,6 +151,9 @@ Code is split into functional concepts: ### Current Archivers Archivers are tested in a meaningful order with Wayback Machine being the failsafe, that can easily be changed in the code. + +> Note: We have 2 Twitter Archivers (`TwitterArchiver`, `TwitterApiArchiver`) because one requires Twitter API V2 credentials and has better results and the other does not rely on official APIs and misses out on some content. + ```mermaid graph TD A(Archiver) -->|parent of| B(TelethonArchiver) diff --git a/archivers/twitter_archiver.py b/archivers/twitter_archiver.py index b70d9f3..08f7118 100644 --- a/archivers/twitter_archiver.py +++ b/archivers/twitter_archiver.py @@ -7,6 +7,10 @@ from .base_archiver import Archiver, ArchiveResult class TwitterArchiver(Archiver): + """ + This Twitter Archiver uses unofficial scraping methods, and it works as + an alternative to TwitterApiArchiver when no API credentials are provided. + """ name = "twitter" link_pattern = re.compile(r"twitter.com\/(?:\#!\/)?(\w+)\/status(?:es)?\/(\d+)") diff --git a/example.config.yaml b/example.config.yaml index cdc0e91..c5b6a76 100644 --- a/example.config.yaml +++ b/example.config.yaml @@ -39,8 +39,10 @@ secrets: # optional, but allows access to more content such as large videos, talk to @botfather bot_token: your bot-token + # twitter configuration - API V2 only + # if you don't provide credentials the less-effective unofficial TwitterArchiver will be used instead twitter: - # twitter configuration - API V2 only - either bearer_token only + # either bearer_token only bearer_token: "" # OR all of the below consumer_key: "" From 4b423dfc34ec82c6f7678c59fe5f193f1bb7f028 Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Mon, 27 Jun 2022 14:36:58 +0200 Subject: [PATCH 4/4] fix telethon exception --- archivers/base_archiver.py | 17 ++++++++--------- archivers/telethon_archiver.py | 23 +++++++++++++++++++++-- 2 files changed, 29 insertions(+), 11 deletions(-) diff --git a/archivers/base_archiver.py b/archivers/base_archiver.py index 70be680..815d31e 100644 --- a/archivers/base_archiver.py +++ b/archivers/base_archiver.py @@ -93,16 +93,19 @@ class Archiver(ABC): return mime.split("/")[0] return "" - # eg images in a tweet save to cloud storage + def download_from_url(self, url, to_filename): + headers = { + 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36' + } + d = requests.get(url, headers=headers) + with open(to_filename, 'wb') as f: + f.write(d.content) def generate_media_page(self, urls, url, object): """ For a list of media urls, fetch them, upload them and call self.generate_media_page_html with them """ - headers = { - 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36' - } thumbnail = None uploaded_media = [] @@ -110,11 +113,7 @@ class Archiver(ABC): key = self._get_key_from_url(media_url, ".jpg") filename = os.path.join(Storage.TMP_FOLDER, key) - - d = requests.get(media_url, headers=headers) - with open(filename, 'wb') as f: - f.write(d.content) - + self.download_from_url(media_url, filename) self.storage.upload(filename, key) hash = self.get_hash(filename) cdn_url = self.storage.get_cdn_url(key) diff --git a/archivers/telethon_archiver.py b/archivers/telethon_archiver.py index c360634..f35e323 100644 --- a/archivers/telethon_archiver.py +++ b/archivers/telethon_archiver.py @@ -82,19 +82,38 @@ class TelethonArchiver(Archiver): cdn_url = self.storage.get_cdn_url(key) return ArchiveResult(status='already archived', cdn_url=cdn_url, title=post.message, timestamp=post.date, screenshot=screenshot) + key_thumb, thumb_index = None, None group_id = post.grouped_id if post.grouped_id is not None else post.id uploaded_media = [] message = post.message - for i, mp in enumerate(media_posts): + for mp in media_posts: if len(mp.message) > len(message): message = mp.message + + # media can also be in entities + if mp.entities: + other_media_urls = [e.url for e in mp.entities if hasattr(e, "url") and e.url and self._guess_file_type(e.url) in ["video", "image"]] + logger.debug(f"Got {len(other_media_urls)} other medial urls from {mp.id=}: {other_media_urls}") + for om_url in other_media_urls: + filename = os.path.join(Storage.TMP_FOLDER, f'{chat}_{group_id}_{self._get_key_from_url(om_url)}') + self.download_from_url(om_url, filename) + key = filename.split(Storage.TMP_FOLDER)[1] + self.storage.upload(filename, key) + hash = self.get_hash(filename) + cdn_url = self.storage.get_cdn_url(key) + uploaded_media.append({'cdn_url': cdn_url, 'key': key, 'hash': hash}) + filename_dest = os.path.join(Storage.TMP_FOLDER, f'{chat}_{group_id}', str(mp.id)) filename = self.client.download_media(mp.media, filename_dest) + if not filename: + logger.debug(f"Empty media found, skipping {str(mp)=}") + continue + key = filename.split(Storage.TMP_FOLDER)[1] self.storage.upload(filename, key) hash = self.get_hash(filename) cdn_url = self.storage.get_cdn_url(key) uploaded_media.append({'cdn_url': cdn_url, 'key': key, 'hash': hash}) - if i == 0: + if key_thumb is None: key_thumb, thumb_index = self.get_thumbnails(filename, key) os.remove(filename)