Compare commits

..

16 Commits

Author SHA1 Message Date
msramalho
f4827770e6 adds instagram no stories as success, and fix for telethon-based archivers. 2024-03-05 14:49:10 +00:00
msramalho
601572d76e strip url 2024-02-29 11:54:01 +00:00
msramalho
d21e79a272 general security updates 2024-02-29 11:40:30 +00:00
msramalho
ccf5f857ef adds configurable limits to instagram/youtube 2024-02-25 15:14:17 +00:00
msramalho
7de317d1b5 avoiding exception 2024-02-23 15:54:33 +00:00
msramalho
70075a1e5e improving insta archiver 2024-02-23 15:37:28 +00:00
msramalho
5b9bc4919a version bump 2024-02-23 14:08:23 +00:00
msramalho
f0158ffd9c adds tagged posts and better parsing 2024-02-23 14:08:17 +00:00
msramalho
bfb35a43a9 adds more details from yt-dlp 2024-02-23 14:08:05 +00:00
msramalho
ef5b39c4f1 dind exception 2024-02-22 18:05:56 +00:00
msramalho
24ceafcb64 missing forward slash 2024-02-22 17:47:13 +00:00
msramalho
9fd4bb56a8 new attempt at dind wacz 2024-02-22 17:24:27 +00:00
msramalho
5324d562ba cleanup wacz patch 2024-02-21 18:14:30 +00:00
msramalho
5bf0a0206d version update 2024-02-21 17:26:07 +00:00
msramalho
4941823565 fix growing volume size in wacz_enricher 2024-02-21 17:25:55 +00:00
msramalho
27310c2911 fixes issue with api requests 2024-02-21 12:25:05 +00:00
15 changed files with 247 additions and 123 deletions

141
Pipfile.lock generated
View File

@@ -172,20 +172,20 @@
},
"boto3": {
"hashes": [
"sha256:46432fd506708fec6caec4392d758c6f5b79a376dee67d3284fe8b6bfbafeaf4",
"sha256:5c96bed1269f77788780aa2005811dc3a37d4122f08b8e54063a1f4c1b9314a1"
"sha256:66303b5f26d92afb72656ff490b22ea72dfff8bf1a29e4a0c5d5f11ec56245dd",
"sha256:898ad2123b18cae8efd85adc56ac2d1925be54592aebc237020d4f16e9a9e7a9"
],
"index": "pypi",
"markers": "python_version >= '3.8'",
"version": "==1.34.45"
"version": "==1.34.52"
},
"botocore": {
"hashes": [
"sha256:bf4fe24dd00a6262a27573dea1690ea68eb20f939e7086effadf19aa1acb44d1",
"sha256:e17874ac708fef295d2ea16bb2570ea0512c920de9f25f796de0d8c778f06a02"
"sha256:05567d8aba344826060481ea309555432c96f0febe22bee7cf5a3b6d3a03cec8",
"sha256:187da93aec3f2e87d8a31eced16fa2cb9c71fe2d69b0a797f9f7a9220f5bf7ae"
],
"markers": "python_version >= '3.8'",
"version": "==1.34.45"
"version": "==1.34.52"
},
"brotli": {
"hashes": [
@@ -273,7 +273,7 @@
"sha256:fd5f17ff8f14003595ab414e45fce13d073e0762394f957182e69035c9f3d7c2",
"sha256:fdc3ff3bfccdc6b9cc7c342c03aa2400683f0cb891d46e94b64a197910dc4064"
],
"markers": "platform_python_implementation >= 'CPython'",
"markers": "implementation_name == 'cpython'",
"version": "==1.1.0"
},
"bs4": {
@@ -286,11 +286,11 @@
},
"cachetools": {
"hashes": [
"sha256:086ee420196f7b2ab9ca2db2520aca326318b68fe5ba8bc4d49cca91add450f2",
"sha256:861f35a13a451f94e301ce2bec7cac63e881232ccce7ed67fab9b5df4d3beaa1"
"sha256:0abad1021d3f8325b2fc1d2e9c8b9c9d57b04c3932657a72465447332c24d945",
"sha256:ba29e2dfa0b8b556606f097407ed1aa62080ee108ab0dc5ec9d6a723a007d105"
],
"markers": "python_version >= '3.7'",
"version": "==5.3.2"
"version": "==5.3.3"
},
"certifi": {
"hashes": [
@@ -479,42 +479,42 @@
},
"cryptography": {
"hashes": [
"sha256:04859aa7f12c2b5f7e22d25198ddd537391f1695df7057c8700f71f26f47a129",
"sha256:069d2ce9be5526a44093a0991c450fe9906cdf069e0e7cd67d9dee49a62b9ebe",
"sha256:0d3ec384058b642f7fb7e7bff9664030011ed1af8f852540c76a1317a9dd0d20",
"sha256:0fab2a5c479b360e5e0ea9f654bcebb535e3aa1e493a715b13244f4e07ea8eec",
"sha256:0fea01527d4fb22ffe38cd98951c9044400f6eff4788cf52ae116e27d30a1ba3",
"sha256:1b797099d221df7cce5ff2a1d272761d1554ddf9a987d3e11f6459b38cd300fd",
"sha256:1e935c2900fb53d31f491c0de04f41110351377be19d83d908c1fd502ae8daa5",
"sha256:20100c22b298c9eaebe4f0b9032ea97186ac2555f426c3e70670f2517989543b",
"sha256:20180da1b508f4aefc101cebc14c57043a02b355d1a652b6e8e537967f1e1b46",
"sha256:25b09b73db78facdfd7dd0fa77a3f19e94896197c86e9f6dc16bce7b37a96504",
"sha256:2619487f37da18d6826e27854a7f9d4d013c51eafb066c80d09c63cf24505306",
"sha256:2eb6368d5327d6455f20327fb6159b97538820355ec00f8cc9464d617caecead",
"sha256:35772a6cffd1f59b85cb670f12faba05513446f80352fe811689b4e439b5d89e",
"sha256:39d5c93e95bcbc4c06313fc6a500cee414ee39b616b55320c1904760ad686938",
"sha256:3d96ea47ce6d0055d5b97e761d37b4e84195485cb5a38401be341fabf23bc32a",
"sha256:4dcab7c25e48fc09a73c3e463d09ac902a932a0f8d0c568238b3696d06bf377b",
"sha256:5fbf0f3f0fac7c089308bd771d2c6c7b7d53ae909dce1db52d8e921f6c19bb3a",
"sha256:6c25e1e9c2ce682d01fc5e2dde6598f7313027343bd14f4049b82ad0402e52cd",
"sha256:762f3771ae40e111d78d77cbe9c1035e886ac04a234d3ee0856bf4ecb3749d54",
"sha256:90147dad8c22d64b2ff7331f8d4cddfdc3ee93e4879796f837bdbb2a0b141e0c",
"sha256:935cca25d35dda9e7bd46a24831dfd255307c55a07ff38fd1a92119cffc34857",
"sha256:93fbee08c48e63d5d1b39ab56fd3fdd02e6c2431c3da0f4edaf54954744c718f",
"sha256:9541c69c62d7446539f2c1c06d7046aef822940d248fa4b8962ff0302862cc1f",
"sha256:c23f03cfd7d9826cdcbad7850de67e18b4654179e01fe9bc623d37c2638eb4ef",
"sha256:c3d1f5a1d403a8e640fa0887e9f7087331abb3f33b0f2207d2cc7f213e4a864c",
"sha256:d1998e545081da0ab276bcb4b33cce85f775adb86a516e8f55b3dac87f469548",
"sha256:d5cf11bc7f0b71fb71af26af396c83dfd3f6eed56d4b6ef95d57867bf1e4ba65",
"sha256:db0480ffbfb1193ac4e1e88239f31314fe4c6cdcf9c0b8712b55414afbf80db4",
"sha256:de4ae486041878dc46e571a4c70ba337ed5233a1344c14a0790c4c4be4bbb8b4",
"sha256:de5086cd475d67113ccb6f9fae6d8fe3ac54a4f9238fd08bfdb07b03d791ff0a",
"sha256:df34312149b495d9d03492ce97471234fd9037aa5ba217c2a6ea890e9166f151",
"sha256:ead69ba488f806fe1b1b4050febafdbf206b81fa476126f3e16110c818bac396"
"sha256:0270572b8bd2c833c3981724b8ee9747b3ec96f699a9665470018594301439ee",
"sha256:111a0d8553afcf8eb02a4fea6ca4f59d48ddb34497aa8706a6cf536f1a5ec576",
"sha256:16a48c23a62a2f4a285699dba2e4ff2d1cff3115b9df052cdd976a18856d8e3d",
"sha256:1b95b98b0d2af784078fa69f637135e3c317091b615cd0905f8b8a087e86fa30",
"sha256:1f71c10d1e88467126f0efd484bd44bca5e14c664ec2ede64c32f20875c0d413",
"sha256:2424ff4c4ac7f6b8177b53c17ed5d8fa74ae5955656867f5a8affaca36a27abb",
"sha256:2bce03af1ce5a5567ab89bd90d11e7bbdff56b8af3acbbec1faded8f44cb06da",
"sha256:329906dcc7b20ff3cad13c069a78124ed8247adcac44b10bea1130e36caae0b4",
"sha256:37dd623507659e08be98eec89323469e8c7b4c1407c85112634ae3dbdb926fdd",
"sha256:3eaafe47ec0d0ffcc9349e1708be2aaea4c6dd4978d76bf6eb0cb2c13636c6fc",
"sha256:5e6275c09d2badf57aea3afa80d975444f4be8d3bc58f7f80d2a484c6f9485c8",
"sha256:6fe07eec95dfd477eb9530aef5bead34fec819b3aaf6c5bd6d20565da607bfe1",
"sha256:7367d7b2eca6513681127ebad53b2582911d1736dc2ffc19f2c3ae49997496bc",
"sha256:7cde5f38e614f55e28d831754e8a3bacf9ace5d1566235e39d91b35502d6936e",
"sha256:9481ffe3cf013b71b2428b905c4f7a9a4f76ec03065b05ff499bb5682a8d9ad8",
"sha256:98d8dc6d012b82287f2c3d26ce1d2dd130ec200c8679b6213b3c73c08b2b7940",
"sha256:a011a644f6d7d03736214d38832e030d8268bcff4a41f728e6030325fea3e400",
"sha256:a2913c5375154b6ef2e91c10b5720ea6e21007412f6437504ffea2109b5a33d7",
"sha256:a30596bae9403a342c978fb47d9b0ee277699fa53bbafad14706af51fe543d16",
"sha256:b03c2ae5d2f0fc05f9a2c0c997e1bc18c8229f392234e8a0194f202169ccd278",
"sha256:b6cd2203306b63e41acdf39aa93b86fb566049aeb6dc489b70e34bcd07adca74",
"sha256:b7ffe927ee6531c78f81aa17e684e2ff617daeba7f189f911065b2ea2d526dec",
"sha256:b8cac287fafc4ad485b8a9b67d0ee80c66bf3574f655d3b97ef2e1082360faf1",
"sha256:ba334e6e4b1d92442b75ddacc615c5476d4ad55cc29b15d590cc6b86efa487e2",
"sha256:ba3e4a42397c25b7ff88cdec6e2a16c2be18720f317506ee25210f6d31925f9c",
"sha256:c41fb5e6a5fe9ebcd58ca3abfeb51dffb5d83d6775405305bfa8715b76521922",
"sha256:cd2030f6650c089aeb304cf093f3244d34745ce0cfcc39f20c6fbfe030102e2a",
"sha256:cd65d75953847815962c84a4654a84850b2bb4aed3f26fadcc1c13892e1e29f6",
"sha256:e4985a790f921508f36f81831817cbc03b102d643b5fcb81cd33df3fa291a1a1",
"sha256:e807b3188f9eb0eaa7bbb579b462c5ace579f1cedb28107ce8b48a9f7ad3679e",
"sha256:f12764b8fffc7a123f641d7d049d382b73f96a34117e0b637b80643169cec8ac",
"sha256:f8837fe1d6ac4a8052a9a8ddab256bc006242696f03368a4009be7ee3075cdb7"
],
"index": "pypi",
"markers": "python_version >= '3.7'",
"version": "==42.0.3"
"version": "==42.0.5"
},
"dataclasses-json": {
"hashes": [
@@ -651,10 +651,11 @@
},
"future": {
"hashes": [
"sha256:34a17436ed1e96697a86f9de3d15a3b0be01d8bc8de9c1dffd59fb8234ed5307"
"sha256:929292d34f5872e70396626ef385ec22355a1fae8ad29e1a734c3e43f9fbc216",
"sha256:bd2968309307861edae1458a4f8a4f3598c03be43b97521076aebf5d94c07b05"
],
"markers": "python_version >= '2.6' and python_version not in '3.0, 3.1, 3.2, 3.3'",
"version": "==0.18.3"
"version": "==1.0.0"
},
"google-api-core": {
"hashes": [
@@ -666,20 +667,20 @@
},
"google-api-python-client": {
"hashes": [
"sha256:9d83b178496b180e058fd206ebfb70ea1afab49f235dd326f557513f56f496d5",
"sha256:ebf4927a3f5184096647be8f705d090e7f06d48ad82b0fa431a2fe80c2cbe182"
"sha256:84e43bdb58dd8d2301669513863996378ffe9a3bf6d23b5ccd4f1e021323dbeb",
"sha256:ff9ef7539eaf7e088a481b25d1af4704210b07863e1d51b5ee498b910a3a46a3"
],
"index": "pypi",
"markers": "python_version >= '3.7'",
"version": "==2.118.0"
"version": "==2.119.0"
},
"google-auth": {
"hashes": [
"sha256:3cfc1b6e4e64797584fb53fc9bd0b7afa9b7c0dba2004fa7dcc9349e58cc3195",
"sha256:7634d29dcd1e101f5226a23cbc4a0c6cda6394253bf80e281d9c5c6797869c53"
"sha256:25141e2d7a14bfcba945f5e9827f98092716e99482562f15306e5b026e21aa72",
"sha256:34fc3046c257cedcf1622fc4b31fc2be7923d9b4d44973d481125ecc50d83885"
],
"markers": "python_version >= '3.7'",
"version": "==2.28.0"
"version": "==2.28.1"
},
"google-auth-httplib2": {
"hashes": [
@@ -725,11 +726,11 @@
},
"httpcore": {
"hashes": [
"sha256:5c0f9546ad17dac4d0772b0808856eb616eb8b48ce94f49ed819fd6982a8a544",
"sha256:9a6a501c3099307d9fd76ac244e08503427679b1e81ceb1d922485e2f2462ad2"
"sha256:ac418c1db41bade2ad53ae2f3834a3a0f5ae76b56cf5aa497d2d033384fc7d73",
"sha256:cb2839ccfcba0d2d3c1131d3c3e26dfc327326fbe7a5dc0dbfe9f6c9151bb022"
],
"markers": "python_version >= '3.8'",
"version": "==1.0.3"
"version": "==1.0.4"
},
"httplib2": {
"hashes": [
@@ -741,11 +742,11 @@
},
"httpx": {
"hashes": [
"sha256:451b55c30d5185ea6b23c2c793abf9bb237d2a7dfb901ced6ff69ad37ec1dfaf",
"sha256:8915f5a3627c4d47b73e8202457cb28f1266982d1159bd5779d86a80c0eab1cd"
"sha256:71d5465162c13681bff01ad59b2cc68dd838ea1f10e51574bac27103f00c91a5",
"sha256:a0cb88a46f32dc874e04ee956e4c2764aba2aa228f650b06788ba6bda2962ab5"
],
"markers": "python_version >= '3.8'",
"version": "==0.26.0"
"version": "==0.27.0"
},
"idna": {
"hashes": [
@@ -966,11 +967,11 @@
},
"marshmallow": {
"hashes": [
"sha256:4c1daff273513dc5eb24b219a8035559dc573c8f322558ef85f5438ddd1236dd",
"sha256:c21d4b98fee747c130e6bc8f45c4b3199ea66bc00c12ee1f639f0aeca034d5e9"
"sha256:20f53be28c6e374a711a16165fb22a8dc6003e3f7cda1285e3ca777b9193885b",
"sha256:e7997f83571c7fd476042c2c188e4ee8a78900ca5e74bd9c8097afa56624e9bd"
],
"markers": "python_version >= '3.8'",
"version": "==3.20.2"
"version": "==3.21.0"
},
"mdurl": {
"hashes": [
@@ -1648,11 +1649,11 @@
},
"rich": {
"hashes": [
"sha256:5cb5123b5cf9ee70584244246816e9114227e0b98ad9176eede6ad54bf5403fa",
"sha256:6da14c108c4866ee9520bbffa71f6fe3962e193b7da68720583850cd4548e235"
"sha256:4edbae314f59eb482f54e9e30bf00d33350aaa94f4bfcd4e9e3110e64d0d7222",
"sha256:9be308cb1fe2f1f57d67ce99e95af38a1e2bc71ad9813b0e247cf7ffbcc3a432"
],
"markers": "python_full_version >= '3.7.0'",
"version": "==13.7.0"
"version": "==13.7.1"
},
"rsa": {
"hashes": [
@@ -1689,11 +1690,11 @@
},
"sniffio": {
"hashes": [
"sha256:e60305c5e5d314f5389259b7f22aaa33d8f7dee49763119234af3755c55b9101",
"sha256:eecefdce1e5bbfb7ad2eeaabf7c1eeb404d7757c379bd1f7e5cce9d8bf425384"
"sha256:2f6da418d1f1e0fddd844478f41680e794e6051915791a034ff65e5f100525a2",
"sha256:f4324edc670a0f49750a81b895f35c3adb843cca46f0530f79fc1babb23789dc"
],
"markers": "python_version >= '3.7'",
"version": "==1.3.0"
"version": "==1.3.1"
},
"snscrape": {
"hashes": [
@@ -1783,11 +1784,11 @@
},
"typing-extensions": {
"hashes": [
"sha256:23478f88c37f27d76ac8aee6c905017a143b0b1b886c3c9f66bc2fd94f9f5783",
"sha256:af72aea155e91adfc61c3ae9e0e342dbc0cba726d6cba4b6c72c1f34e47291cd"
"sha256:69b1a937c3a517342112fb4c6df7e72fc39a38e7891a5730ed4985b5214b5475",
"sha256:b0abd7c89e8fb96f98db18d86106ff1d90ab692004eb746cf6eda2682f91b3cb"
],
"markers": "python_version >= '3.8'",
"version": "==4.9.0"
"version": "==4.10.0"
},
"typing-inspect": {
"hashes": [
@@ -1922,7 +1923,7 @@
"sha256:fc4e7fa5414512b481a2483775a8e8be7803a35b30ca805afa4998a84f9fd9e8",
"sha256:ffefa1374cd508d633646d51a8e9277763a9b78ae71324183693959cf94635a7"
],
"markers": "python_version >= '3.7'",
"markers": "python_version >= '3.8'",
"version": "==12.0"
},
"werkzeug": {

View File

@@ -7,6 +7,7 @@ steps:
# - telegram_archiver
# - twitter_archiver
# - twitter_api_archiver
# - instagram_api_archiver
# - instagram_tbot_archiver
# - instagram_archiver
# - tiktok_archiver

View File

@@ -22,6 +22,7 @@ class InstagramAPIArchiver(Archiver):
super().__init__(config)
self.assert_valid_string("access_token")
self.assert_valid_string("api_endpoint")
self.full_profile_max_posts = int(self.full_profile_max_posts)
if self.api_endpoint[-1] == "/": self.api_endpoint = self.api_endpoint[:-1]
self.full_profile = bool(self.full_profile)
@@ -33,6 +34,7 @@ class InstagramAPIArchiver(Archiver):
"access_token": {"default": None, "help": "a valid instagrapi-api token"},
"api_endpoint": {"default": None, "help": "API endpoint to use"},
"full_profile": {"default": False, "help": "if true, will download all posts, tagged posts, stories, and highlights for a profile, if false, will only download the profile pic and information."},
"full_profile_max_posts": {"default": 0, "help": "Use to limit the number of posts to download when full_profile is true. 0 means no limit. limit is applied softly since posts are fetched in batch, once to: posts, tagged posts, and highlights"},
"minimize_json_output": {"default": True, "help": "if true, will remove empty values from the json output"},
}
@@ -73,9 +75,9 @@ class InstagramAPIArchiver(Archiver):
if type(d) == list: return [self.cleanup_dict(v) for v in d]
if type(d) != dict: return d
return {
k: self.cleanup_dict(v) if type(v) in [dict, list] else v
k: clean_v
for k, v in d.items()
if v not in [0.0, 0, [], {}, "", None, "null"] and
if (clean_v := self.cleanup_dict(v)) not in [0.0, 0, [], {}, "", None, "null"] and
k not in ["x", "y", "width", "height"]
}
@@ -93,9 +95,6 @@ class InstagramAPIArchiver(Archiver):
if self.full_profile:
user_id = user.get("pk")
# download all posts
self.download_all_posts(result, user_id)
# download all stories
try:
stories = self._download_stories_reusable(result, username)
@@ -104,25 +103,46 @@ class InstagramAPIArchiver(Archiver):
result.append("errors", f"Error downloading stories for {username}")
logger.error(f"Error downloading stories for {username}: {e}")
# download all posts
try:
self.download_all_posts(result, user_id)
except Exception as e:
result.append("errors", f"Error downloading posts for {username}")
logger.error(f"Error downloading posts for {username}: {e}")
# download all tagged
try:
self.download_all_tagged(result, user_id)
except Exception as e:
result.append("errors", f"Error downloading tagged posts for {username}")
logger.error(f"Error downloading tagged posts for {username}: {e}")
# download all highlights
try:
count_highlights = 0
highlights = self.call_api(f"v1/user/highlights", {"user_id": user_id})
for h in highlights:
try:
h_info = self._download_highlights_reusable(result, h.get("pk"))
count_highlights += len(h_info.get("items", []))
except Exception as e:
result.append("errors", f"Error downloading highlight id{h.get('pk')} for {username}")
logger.error(f"Error downloading highlight id{h.get('pk')} for {username}: {e}")
result.set("#highlights", count_highlights)
self.download_all_highlights(result, username, user_id)
except Exception as e:
result.append("errors", f"Error downloading highlights for {username}")
logger.error(f"Error downloading highlights for {username}: {e}")
result.set_url(url) # reset as scrape_item modifies it
return result.success("insta profile")
def download_all_highlights(self, result, username, user_id):
count_highlights = 0
highlights = self.call_api(f"v1/user/highlights", {"user_id": user_id})
for h in highlights:
try:
h_info = self._download_highlights_reusable(result, h.get("pk"))
count_highlights += len(h_info.get("items", []))
except Exception as e:
result.append("errors", f"Error downloading highlight id{h.get('pk')} for {username}")
logger.error(f"Error downloading highlight id{h.get('pk')} for {username}: {e}")
if self.full_profile_max_posts and count_highlights >= self.full_profile_max_posts:
logger.info(f"HIGHLIGHTS reached full_profile_max_posts={self.full_profile_max_posts}")
break
result.set("#highlights", count_highlights)
def download_post(self, result: Metadata, code: str = None, id: str = None, context: str = None) -> Metadata:
if id:
post = self.call_api(f"v1/media/by/id", {"id": id})
@@ -166,12 +186,13 @@ class InstagramAPIArchiver(Archiver):
def download_stories(self, result: Metadata, username: str) -> Metadata:
now = datetime.now().strftime("%Y-%m-%d_%H-%M")
stories = self._download_stories_reusable(result, username)
if stories == []: return result.success("insta no story")
result.set_title(f"stories {username} at {now}").set("#stories", len(stories))
return result.success(f"insta stories {now}")
def _download_stories_reusable(self, result: Metadata, username: str) -> list[dict]:
stories = self.call_api(f"v1/user/stories/by/username", {"username": username})
assert stories, f"Stories for {username} not found"
if not stories or not len(stories): return []
stories = stories[::-1] # newest to oldest
for s in tqdm(stories, desc="downloading stories", unit="story"):
@@ -188,7 +209,7 @@ class InstagramAPIArchiver(Archiver):
post_count = 0
while end_cursor != "":
posts = self.call_api(f"v1/user/medias/chunk", {"user_id": user_id, "end_cursor": end_cursor})
if not len(posts): break
if not len(posts) or not type(posts) == list or len(posts) != 2: break
posts, end_cursor = posts[0], posts[1]
logger.info(f"parsing {len(posts)} posts, next {end_cursor=}")
@@ -199,7 +220,35 @@ class InstagramAPIArchiver(Archiver):
logger.error(f"Error downloading post, skipping {p.get('id')}: {e}")
pbar.update(1)
post_count+=1
if self.full_profile_max_posts and post_count >= self.full_profile_max_posts:
logger.info(f"POSTS reached full_profile_max_posts={self.full_profile_max_posts}")
break
result.set("#posts", post_count)
def download_all_tagged(self, result: Metadata, user_id: str):
next_page_id = ""
pbar = tqdm(desc="downloading tagged posts")
tagged_count = 0
while next_page_id != None:
resp = self.call_api(f"v2/user/tag/medias", {"user_id": user_id, "page_id": next_page_id})
posts = resp.get("response", {}).get("items", [])
if not len(posts): break
next_page_id = resp.get("next_page_id")
logger.info(f"parsing {len(posts)} tagged posts, next {next_page_id=}")
for p in posts:
try: self.scrape_item(result, p, "tagged")
except Exception as e:
result.append("errors", f"Error downloading tagged post {p.get('id')}")
logger.error(f"Error downloading tagged post, skipping {p.get('id')}: {e}")
pbar.update(1)
tagged_count+=1
if self.full_profile_max_posts and tagged_count >= self.full_profile_max_posts:
logger.info(f"TAGS reached full_profile_max_posts={self.full_profile_max_posts}")
break
result.set("#tagged", tagged_count)
### reusable parsing utils below
@@ -217,10 +266,10 @@ class InstagramAPIArchiver(Archiver):
if self.minimize_json_output:
del item["clips_metadata"]
if code := item.get("code"):
result.set("url", f"https://www.instagram.com/p/{code}/")
if code := item.get("code") and not result.get("url"):
result.set_url(f"https://www.instagram.com/p/{code}/")
resources = item.get("resources", [])
resources = item.get("resources", item.get("carousel_media", []))
item, media, media_id = self.scrape_media(item, context)
# if resources are present take the main media from the first resource
if not media and len(resources):
@@ -242,7 +291,7 @@ class InstagramAPIArchiver(Archiver):
def scrape_media(self, item: dict, context:str) -> tuple[dict, Media, str]:
# remove unnecessary info
if self.minimize_json_output:
for k in ["image_versions", "video_versions", "video_dash_manifest"]:
for k in ["image_versions", "video_versions", "video_dash_manifest", "image_versions2", "video_versions2"]:
if k in item: del item[k]
item = self.cleanup_dict(item)
@@ -253,19 +302,24 @@ class InstagramAPIArchiver(Archiver):
# retrieve video info
best_id = item.get('id', item.get('pk'))
taken_at = item.get("taken_at")
taken_at = item.get("taken_at", item.get("taken_at_ts"))
code = item.get("code")
caption_text = item.get("caption_text")
if "carousel_media" in item: del item["carousel_media"]
if video_url := item.get("video_url"):
filename = self.download_from_url(video_url, verbose=False)
video_media = Media(filename=filename)
if taken_at: video_media.set("date", taken_at)
if code: video_media.set("url", f"https://www.instagram.com/p/{code}")
if caption_text: video_media.set("text", caption_text)
video_media.set("preview", [image_media])
video_media.set("data", [item])
return item, video_media, f"{context or 'video'} {best_id}"
elif image_media:
if taken_at: image_media.set("date", taken_at)
if code: image_media.set("url", f"https://www.instagram.com/p/{code}")
if caption_text: image_media.set("text", caption_text)
image_media.set("data", [item])
return item, image_media, f"{context or 'image'} {best_id}"

View File

@@ -42,7 +42,7 @@ class InstagramTbotArchiver(Archiver):
# make a copy of the session that is used exclusively with this archiver instance
new_session_file = os.path.join("secrets/", f"instabot-{time.strftime('%Y-%m-%d')}{random_str(8)}.session")
shutil.copy(self.session_file + ".session", new_session_file)
self.session_file = new_session_file
self.session_file = new_session_file.replace(".session", "")
try:
self.client = TelegramClient(self.session_file, self.api_id, self.api_hash)

View File

@@ -49,7 +49,7 @@ class TelethonArchiver(Archiver):
# make a copy of the session that is used exclusively with this archiver instance
new_session_file = os.path.join("secrets/", f"telethon-{time.strftime('%Y-%m-%d')}{random_str(8)}.session")
shutil.copy(self.session_file + ".session", new_session_file)
self.session_file = new_session_file
self.session_file = new_session_file.replace(".session", "")
# initiate the client
self.client = TelegramClient(self.session_file, self.api_id, self.api_hash)

View File

@@ -15,6 +15,8 @@ class YoutubeDLArchiver(Archiver):
self.livestreams = bool(self.livestreams)
self.live_from_start = bool(self.live_from_start)
self.end_means_success = bool(self.end_means_success)
self.allow_playlist = bool(self.allow_playlist)
self.max_downloads = self.max_downloads
@staticmethod
def configs() -> dict:
@@ -26,6 +28,8 @@ class YoutubeDLArchiver(Archiver):
"live_from_start": {"default": False, "help": "if set, will download live streams from their earliest available moment, otherwise starts now."},
"proxy": {"default": "", "help": "http/socks (https seems to not work atm) proxy to use for the webdriver, eg https://proxy-user:password@proxy-ip:port"},
"end_means_success": {"default": True, "help": "if True, any archived content will mean a 'success', if False this archiver will not return a 'success' stage; this is useful for cases when the yt-dlp will archive a video but ignore other types of content like images or text only pages that the subsequent archivers can retrieve."},
'allow_playlist': {"default": False, "help": "If True will also download playlists, set to False if the expectation is to download a single video."},
"max_downloads": {"default": "inf", "help": "Use to limit the number of videos to download when a channel or long page is being extracted. 'inf' means no limit."},
}
def download(self, item: Metadata) -> Metadata:
@@ -35,11 +39,11 @@ class YoutubeDLArchiver(Archiver):
logger.debug('Using Facebook cookie')
yt_dlp.utils.std_headers['cookie'] = self.facebook_cookie
ydl_options = {'outtmpl': os.path.join(ArchivingContext.get_tmp_dir(), f'%(id)s.%(ext)s'), 'quiet': False, 'noplaylist': True, 'writesubtitles': self.subtitles, 'writeautomaticsub': self.subtitles, "live_from_start": self.live_from_start, "proxy": self.proxy}
ydl_options = {'outtmpl': os.path.join(ArchivingContext.get_tmp_dir(), f'%(id)s.%(ext)s'), 'quiet': False, 'noplaylist': not self.allow_playlist , 'writesubtitles': self.subtitles, 'writeautomaticsub': self.subtitles, "live_from_start": self.live_from_start, "proxy": self.proxy, "max_downloads": self.max_downloads, "playlistend": self.max_downloads}
ydl = yt_dlp.YoutubeDL(ydl_options) # allsubtitles and subtitleslangs not working as expected, so default lang is always "en"
try:
# don'd download since it can be a live stream
# don't download since it can be a live stream
info = ydl.extract_info(url, download=False)
if info.get('is_live', False) and not self.livestreams:
logger.warning("Livestream detected, skipping due to 'livestreams' configuration setting")
@@ -52,7 +56,8 @@ class YoutubeDLArchiver(Archiver):
return False
# this time download
ydl = yt_dlp.YoutubeDL({**ydl_options, "getcomments": self.comments})
ydl = yt_dlp.YoutubeDL({**ydl_options, "getcomments": self.comments})
#TODO: for playlist or long lists of videos, how to download one at a time so they can be stored before the next one is downloaded?
info = ydl.extract_info(url, download=True)
if "entries" in info:
@@ -64,13 +69,17 @@ class YoutubeDLArchiver(Archiver):
result = Metadata()
result.set_title(info.get("title"))
if "description" in info: result.set_content(info["description"])
for entry in entries:
try:
filename = ydl.prepare_filename(entry)
if not os.path.exists(filename):
filename = filename.split('.')[0] + '.mkv'
new_media = Media(filename).set("duration", info.get("duration"))
new_media = Media(filename)
for x in ["duration", "original_url", "fulltitle", "description", "upload_date"]:
if x in entry: new_media.set(x, entry[x])
# read text from subtitles if enabled
if self.subtitles:
for lang, val in (info.get('requested_subtitles') or {}).items():

View File

@@ -1,5 +1,7 @@
from __future__ import annotations
from typing import Generator, Union, List
from urllib.parse import urlparse
from ipaddress import ip_address
from .context import ArchivingContext
@@ -60,7 +62,9 @@ class ArchivingOrchestrator:
exit()
except Exception as e:
logger.error(f'Got unexpected error on item {item}: {e}\n{traceback.format_exc()}')
for d in self.databases: d.failed(item)
for d in self.databases:
if type(e) == AssertionError: d.failed(item, str(e))
else: d.failed(item)
def archive(self, result: Metadata) -> Union[Metadata, None]:
@@ -73,7 +77,8 @@ class ArchivingOrchestrator:
5. Store all downloaded/generated media
6. Call selected Formatter and store formatted if needed
"""
original_url = result.get_url()
original_url = result.get_url().strip()
self.assert_valid_url(original_url)
# 1 - sanitize - each archiver is responsible for cleaning/expanding its own URLs
url = original_url
@@ -90,7 +95,9 @@ class ArchivingOrchestrator:
if cached_result:
logger.debug("Found previously archived entry")
for d in self.databases:
d.done(cached_result, cached=True)
try: d.done(cached_result, cached=True)
except Exception as e:
logger.error(f"ERROR database {d.name}: {e}: {traceback.format_exc()}")
return cached_result
# 3 - call archivers until one succeeds
@@ -120,6 +127,29 @@ class ArchivingOrchestrator:
result.status = "nothing archived"
# signal completion to databases and archivers
for d in self.databases: d.done(result)
for d in self.databases:
try: d.done(result)
except Exception as e:
logger.error(f"ERROR database {d.name}: {e}: {traceback.format_exc()}")
return result
def assert_valid_url(self, url: str) -> bool:
"""
Blocks localhost, private, reserved, and link-local IPs and all non-http/https schemes.
"""
assert url.startswith("http://") or url.startswith("https://"), f"Invalid URL scheme"
parsed = urlparse(url)
assert parsed.scheme in ["http", "https"], f"Invalid URL scheme"
assert parsed.hostname, f"Invalid URL hostname"
assert parsed.hostname != "localhost", f"Invalid URL"
try: # special rules for IP addresses
ip = ip_address(parsed.hostname)
except ValueError: pass
else:
assert ip.is_global, f"Invalid IP used"
assert not ip.is_reserved, f"Invalid IP used"
assert not ip.is_link_local, f"Invalid IP used"
assert not ip.is_private, f"Invalid IP used"

View File

@@ -23,8 +23,7 @@ class AAApiDb(Database):
def configs() -> dict:
return {
"api_endpoint": {"default": None, "help": "API endpoint where calls are made to"},
"api_secret": {"default": None, "help": "API Basic authentication secret [deprecating soon]"},
"api_token": {"default": None, "help": "API Bearer token, to be preferred over secret (Basic auth) going forward"},
"api_token": {"default": None, "help": "API Bearer token."},
"public": {"default": False, "help": "whether the URL should be publicly available via the API"},
"author_id": {"default": None, "help": "which email to assign as author"},
"group_id": {"default": None, "help": "which group of users have access to the archive in case public=false as author"},
@@ -59,7 +58,7 @@ class AAApiDb(Database):
logger.debug(f"saving archive of {item.get_url()} to the AA API.")
payload = {'result': item.to_json(), 'public': self.public, 'author_id': self.author_id, 'group_id': self.group_id, 'tags': list(self.tags)}
headers = {"Authorization": f"Bearer {self.api_secret}"}
headers = {"Authorization": f"Bearer {self.api_token}"}
response = requests.post(os.path.join(self.api_endpoint, "submit-archive"), json=payload, headers=headers)
if response.status_code == 200:

View File

@@ -21,8 +21,8 @@ class ConsoleDb(Database):
def started(self, item: Metadata) -> None:
logger.warning(f"STARTED {item}")
def failed(self, item: Metadata) -> None:
logger.error(f"FAILED {item}")
def failed(self, item: Metadata, reason:str) -> None:
logger.error(f"FAILED {item}: {reason}")
def aborted(self, item: Metadata) -> None:
logger.warning(f"ABORTED {item}")

View File

@@ -22,7 +22,7 @@ class Database(Step, ABC):
"""signals the DB that the given item archival has started"""
pass
def failed(self, item: Metadata) -> None:
def failed(self, item: Metadata, reason:str) -> None:
"""update DB accordingly for failure"""
pass

View File

@@ -29,9 +29,9 @@ class GsheetsDb(Database):
gw, row = self._retrieve_gsheet(item)
gw.set_cell(row, 'status', 'Archive in progress')
def failed(self, item: Metadata) -> None:
def failed(self, item: Metadata, reason:str) -> None:
logger.error(f"FAILED {item}")
self._safe_status_update(item, 'Archive failed')
self._safe_status_update(item, f'Archive failed {reason}')
def aborted(self, item: Metadata) -> None:
logger.warning(f"ABORTED {item}")
@@ -102,6 +102,11 @@ class GsheetsDb(Database):
def _retrieve_gsheet(self, item: Metadata) -> Tuple[GWorksheet, int]:
# TODO: to make gsheet_db less coupled with gsheet_feeder's "gsheet" parameter, this method could 1st try to fetch "gsheet" from ArchivingContext and, if missing, manage its own singleton - not needed for now
gw: GWorksheet = ArchivingContext.get("gsheet").get("worksheet")
row: int = ArchivingContext.get("gsheet").get("row")
if gsheet := ArchivingContext.get("gsheet"):
gw: GWorksheet = gsheet.get("worksheet")
row: int = gsheet.get("row")
elif self.sheet_id:
print(self.sheet_id)
return gw, row

View File

@@ -27,7 +27,10 @@ class SSLEnricher(Enricher):
if not to_enrich.media and self.skip_when_nothing_archived: return
url = to_enrich.get_url()
domain = urlparse(url).netloc
parsed = urlparse(url)
assert parsed.scheme in ["https"], f"Invalid URL scheme {url=}"
domain = parsed.netloc
logger.debug(f"fetching SSL certificate for {domain=} in {url=}")
cert = ssl.get_server_certificate((domain, 443))

View File

@@ -35,6 +35,22 @@ class WaczArchiverEnricher(Enricher, Archiver):
"socks_proxy_host": {"default": None, "help": "SOCKS proxy host for browsertrix-crawler, use in combination with socks_proxy_port. eg: user:password@host"},
"socks_proxy_port": {"default": None, "help": "SOCKS proxy port for browsertrix-crawler, use in combination with socks_proxy_host. eg 1234"},
}
def setup(self) -> None:
self.use_docker = os.environ.get('WACZ_ENABLE_DOCKER') or not os.environ.get('RUNNING_IN_DOCKER')
self.docker_in_docker = os.environ.get('WACZ_ENABLE_DOCKER') and os.environ.get('RUNNING_IN_DOCKER')
self.cwd_dind = f"/crawls/crawls{random_str(8)}"
self.browsertrix_home_host = os.environ.get('BROWSERTRIX_HOME_HOST')
self.browsertrix_home_container = os.environ.get('BROWSERTRIX_HOME_CONTAINER') or self.browsertrix_home_host
# create crawls folder if not exists, so it can be safely removed in cleanup
if self.docker_in_docker:
os.makedirs(self.cwd_dind, exist_ok=True)
def cleanup(self) -> None:
if self.docker_in_docker:
logger.debug(f"Removing {self.cwd_dind=}")
shutil.rmtree(self.cwd_dind, ignore_errors=True)
def download(self, item: Metadata) -> Metadata:
# this new Metadata object is required to avoid duplication
@@ -51,8 +67,8 @@ class WaczArchiverEnricher(Enricher, Archiver):
url = to_enrich.get_url()
collection = random_str(8)
browsertrix_home_host = os.environ.get('BROWSERTRIX_HOME_HOST') or os.path.abspath(ArchivingContext.get_tmp_dir())
browsertrix_home_container = os.environ.get('BROWSERTRIX_HOME_CONTAINER') or browsertrix_home_host
browsertrix_home_host = self.browsertrix_home_host or os.path.abspath(ArchivingContext.get_tmp_dir())
browsertrix_home_container = self.browsertrix_home_container or browsertrix_home_host
cmd = [
"crawl",
@@ -67,11 +83,12 @@ class WaczArchiverEnricher(Enricher, Archiver):
"--behaviors", "autoscroll,autoplay,autofetch,siteSpecific",
"--behaviorTimeout", str(self.timeout),
"--timeout", str(self.timeout)]
if self.docker_in_docker:
cmd.extend(["--cwd", self.cwd_dind])
# call docker if explicitly enabled or we are running on the host (not in docker)
use_docker = os.environ.get('WACZ_ENABLE_DOCKER') or not os.environ.get('RUNNING_IN_DOCKER')
if use_docker:
if self.use_docker:
logger.debug(f"generating WACZ in Docker for {url=}")
logger.debug(f"{browsertrix_home_host=} {browsertrix_home_container=}")
if self.docker_commands:
@@ -103,7 +120,10 @@ class WaczArchiverEnricher(Enricher, Archiver):
logger.error(f"WACZ generation failed: {e}")
return False
if use_docker:
if self.docker_in_docker:
wacz_fn = os.path.join(self.cwd_dind, "collections", collection, f"{collection}.wacz")
elif self.use_docker:
wacz_fn = os.path.join(browsertrix_home_container, "collections", collection, f"{collection}.wacz")
else:
wacz_fn = os.path.join("collections", collection, f"{collection}.wacz")
@@ -116,7 +136,9 @@ class WaczArchiverEnricher(Enricher, Archiver):
if self.extract_media or self.extract_screenshot:
self.extract_media_from_wacz(to_enrich, wacz_fn)
if use_docker:
if self.docker_in_docker:
jsonl_fn = os.path.join(self.cwd_dind, "collections", collection, "pages", "pages.jsonl")
elif self.use_docker:
jsonl_fn = os.path.join(browsertrix_home_container, "collections", collection, "pages", "pages.jsonl")
else:
jsonl_fn = os.path.join("collections", collection, "pages", "pages.jsonl")

View File

@@ -21,7 +21,7 @@ class HtmlFormatter(Formatter):
def __init__(self, config: dict) -> None:
# without this STEP.__init__ is not called
super().__init__(config)
self.environment = Environment(loader=FileSystemLoader(os.path.join(pathlib.Path(__file__).parent.resolve(), "templates/")))
self.environment = Environment(loader=FileSystemLoader(os.path.join(pathlib.Path(__file__).parent.resolve(), "templates/")), autoescape=True)
# JinjaHelper class static methods are added as filters
self.environment.filters.update({
k: v.__func__ for k, v in JinjaHelpers.__dict__.items() if isinstance(v, staticmethod)

View File

@@ -1,6 +1,6 @@
_MAJOR = "0"
_MINOR = "9"
_MINOR = "10"
# On main and in a nightly release the patch should be one ahead of the last
# released build.
_PATCH = "1"