diff --git a/.gitignore b/.gitignore index 4d19b9e..04b03ee 100644 --- a/.gitignore +++ b/.gitignore @@ -21,4 +21,5 @@ gd-token.json credentials.json secrets/* browsertrix/* -browsertrix-tmp/* \ No newline at end of file +browsertrix-tmp/* +instaloader/* \ No newline at end of file diff --git a/Pipfile b/Pipfile index 88fad6a..aa04ea4 100644 --- a/Pipfile +++ b/Pipfile @@ -25,6 +25,10 @@ pyyaml = "*" dateparser = "*" vk-url-scraper = "*" python-twitter-v2 = "*" +instaloader = "*" [requires] python_version = "3.9" + +[dev-packages] +autopep8 = "*" diff --git a/Pipfile.lock b/Pipfile.lock index 271a661..e2d1b1b 100644 --- a/Pipfile.lock +++ b/Pipfile.lock @@ -1,7 +1,7 @@ { "_meta": { "hash": { - "sha256": "1ed953d08e31d891de0f887e520f12025d109a20718b27dd8f9b361f73c95651" + "sha256": "bd987e7237c7e32d2dffb295db633f5a022ce1a718435d11d8ac303c9e37a4d3" }, "pipfile-spec": 6, "requires": { @@ -29,7 +29,7 @@ "sha256:01c7bf666359b4967d2cda0000cc2e4af16a0ae098cbffcb8472fb9e8ad6585b", "sha256:6ebb3d106c12920aaae42ccb6f787ef5eefdcdd166ea3d628fa8476abe712144" ], - "markers": "python_full_version >= '3.5.0'", + "markers": "python_version >= '3.5'", "version": "==1.10" }, "attrs": { @@ -37,7 +37,7 @@ "sha256:29adc2665447e5191d0e7c568fde78b21f9672d344281d0c6e1ab085429b22b6", "sha256:86efa402f67bf2df34f51a335487cf46b1ec130d02b8d39fd248abfd30da551c" ], - "markers": "python_full_version >= '3.5.0'", + "markers": "python_version >= '3.5'", "version": "==22.1.0" }, "authlib": { @@ -159,7 +159,7 @@ "sha256:84c85a9078b11105f04f3036a9482ae10e4621616db313fe045dd24743a0820d", "sha256:fe86415d55e84719d75f8b69414f6438ac3547d2078ab91b67e779ef69378412" ], - "markers": "python_version >= '3.6'", + "markers": "python_full_version >= '3.6.0'", "version": "==2022.6.15" }, "cffi": { @@ -236,7 +236,7 @@ "sha256:2857e29ff0d34db842cd7ca3230549d1a697f96ee6d3fb071cfa6c7393832597", "sha256:6881edbebdb17b39b4eaaa821b438bf6eddffb4468cf344f09f89def34a8b1df" ], - "markers": "python_full_version >= '3.5.0'", + 
"markers": "python_version >= '3.5'", "version": "==2.0.12" }, "click": { @@ -290,7 +290,7 @@ "sha256:d4ef6cc305394ed669d4d9eebf10d3a101059bdcf2669c366ec1d14e4fb227bd", "sha256:d9e69ae01f99abe6ad646947bba8941e896cb3aa805be2597a0400e0764b5818" ], - "markers": "python_version >= '3.6'", + "markers": "python_full_version >= '3.6.0'", "version": "==38.0.1" }, "dataclasses-json": { @@ -298,7 +298,7 @@ "sha256:bc285b5f892094c3a53d558858a88553dd6a61a11ab1a8128a0e554385dcc5dd", "sha256:c2c11bc8214fbf709ffc369d11446ff6945254a7f09128154a7620613d8fda90" ], - "markers": "python_version >= '3.6'", + "markers": "python_full_version >= '3.6.0'", "version": "==0.5.7" }, "dateparser": { @@ -425,9 +425,16 @@ "sha256:84d9dd047ffa80596e0f246e2eab0b391788b0503584e8945f2368256d2735ff", "sha256:9d643ff0a55b762d5cdb124b8eaa99c66322e2157b69160bc32796e824360e6d" ], - "markers": "python_full_version >= '3.5.0'", + "markers": "python_version >= '3.5'", "version": "==3.3" }, + "instaloader": { + "hashes": [ + "sha256:ba925a87e2c305a3d24173d1bb0457d5a7e2e77dbac7206eeeb46f9104ecb08e" + ], + "index": "pypi", + "version": "==4.9.5" + }, "itsdangerous": { "hashes": [ "sha256:2c2349112351b88699d8d4b6b075022c0808887cb7ad10069318a8b0bc88db44", @@ -625,7 +632,7 @@ "sha256:8139f29aac13e25d502680e9e19963e83f16838d48a0d71c287fe40e7067fbca", "sha256:9859c40929662bec5d64f34d01c99e093149682a3f38915dc0655d5a633dd918" ], - "markers": "python_version >= '3.6'", + "markers": "python_full_version >= '3.6.0'", "version": "==3.2.2" }, "outcome": { @@ -641,7 +648,7 @@ "sha256:dd47c42927d89ab911e606518907cc2d3a1f38bbd026385970643f9c5b8ecfeb", "sha256:ef103e05f519cdc783ae24ea4e2e0f508a9c99b2d4969652eed6a2e1ea5bd522" ], - "markers": "python_version >= '3.6'", + "markers": "python_full_version >= '3.6.0'", "version": "==21.3" }, "protobuf": { @@ -754,7 +761,7 @@ "sha256:56a8508ae95f98e2b9bdf93a6be5ae3f7d8af858b43e02c5a2ff083726be40c1", "sha256:f643f331ab57ba3c9d89212ee4a2dabc6e94f117cf4eefde99a0574720d14c42" ], - 
"markers": "python_version >= '3.6'", + "markers": "python_full_version >= '3.6.0'", "version": "==2.13.0" }, "pyparsing": { @@ -786,7 +793,7 @@ "sha256:b7e3b04a59693c42c36f9ab1cc2acc46fa5df8c78e178fc33a8d4cd05c8d498f", "sha256:d92a187be61fe482e4fd675b6d52200e7be63a12b724abbf931a40ce4fa92938" ], - "markers": "python_full_version >= '3.5.0'", + "markers": "python_version >= '3.5'", "version": "==0.20.0" }, "python-slugify": { @@ -807,10 +814,9 @@ }, "pytz": { "hashes": [ - "sha256:2c0784747071402c6e99f0bafdb7da0fa22645f06554c7ae06bf6358897e9c91", - "sha256:48ce799d83b6f8aab2020e369b627446696619e79645419610b9facd909b3174" + "sha256:335ab46900b1465e714b4fda4963d87363264eb662aab5e65da039c25f1f5b22" ], - "version": "==2022.4" + "version": "==2022.5" }, "pytz-deprecation-shim": { "hashes": [ @@ -943,7 +949,7 @@ "sha256:fbc88d3ba402b5d041d204ec2449c4078898f89c4a6e6f0ed1c1a510ef1e221d", "sha256:fbd3fe37353c62fd0eb19fb76f78aa693716262bcd5f9c14bb9e5aca4b3f0dc4" ], - "markers": "python_version >= '3.6'", + "markers": "python_full_version >= '3.6.0'", "version": "==2022.3.2" }, "requests": { @@ -983,7 +989,7 @@ "sha256:90260d9058e514786967344d0ef75fa8727eed8a7d2e43ce9f4bcf1b536174f7", "sha256:e38464a49c6c85d7f1351b0126661487a7e0a14a50f1675ec50eb34d4f20ef21" ], - "markers": "python_version >= '3.6' and python_version < '4'", + "markers": "python_version < '4' and python_full_version >= '3.6.0'", "version": "==4.9" }, "s3transfer": { @@ -1037,7 +1043,7 @@ "sha256:3b2503d3c7084a42b1ebd08116e5f81aadfaea95863628c80a3b774a11b7c759", "sha256:fc53893b3da2c33de295667a0e19f078c14bf86544af307354de5fcf12a3f30d" ], - "markers": "python_version >= '3.6'", + "markers": "python_full_version >= '3.6.0'", "version": "==2.3.2.post1" }, "telethon": { @@ -1083,7 +1089,7 @@ "sha256:5b558f6e83cc20a37c3b61202476c5295d1addf57bd65543364e0337e37ed2bc", "sha256:a3d34de8fac26023eee701ed1e7bf4da9a8326b61a62934ec9e53b64970fd8fe" ], - "markers": "python_full_version >= '3.5.0'", + "markers": "python_version 
>= '3.5'", "version": "==0.9.2" }, "typing-extensions": { @@ -1106,7 +1112,7 @@ "sha256:323161b22b7802fdc78f20ca5f6073639c64f1a7227c40cd3e19fd1d0ce6650a", "sha256:e15b2b3005e2546108af42a0eb4ccab4d9e225e2dfbf4f77aad50c70a4b1f3ab" ], - "markers": "python_version >= '3.6'", + "markers": "python_full_version >= '3.6.0'", "version": "==2022.5" }, "tzlocal": { @@ -1114,7 +1120,7 @@ "sha256:89885494684c929d9191c57aa27502afc87a579be5cdd3225c77c463ea043745", "sha256:ee5842fa3a795f023514ac2d801c4a81d1743bbe642e3940143326b3a00addd7" ], - "markers": "python_version >= '3.6'", + "markers": "python_full_version >= '3.6.0'", "version": "==4.2" }, "uritemplate": { @@ -1122,11 +1128,10 @@ "sha256:4346edfc5c3b79f694bccd6d6099a322bbeb628dbf2cd86eea55a456ce5124f0", "sha256:830c08b8d99bdd312ea4ead05994a38e8936266f84b9a7878232db50b044e02e" ], - "markers": "python_version >= '3.6'", + "markers": "python_full_version >= '3.6.0'", "version": "==4.1.1" }, "urllib3": { - "extras": [], "hashes": [ "sha256:44ece4d53fb1706f667c9bd1c648f5469a2ec925fcf3a776667042d645472c14", "sha256:aabaf16477806a5e1dd19aa41f8c2b7950dd3c746362d7e3223dbe6de6ac448e" @@ -1228,5 +1233,30 @@ "version": "==2022.5.18" } }, - "develop": {} + "develop": { + "autopep8": { + "hashes": [ + "sha256:6f09e90a2be784317e84dc1add17ebfc7abe3924239957a37e5040e27d812087", + "sha256:ca9b1a83e53a7fad65d731dc7a2a2d50aa48f43850407c59f6a1a306c4201142" + ], + "index": "pypi", + "version": "==1.7.0" + }, + "pycodestyle": { + "hashes": [ + "sha256:2c9607871d58c76354b697b42f5d57e1ada7d261c261efac224b664affdc5785", + "sha256:d1735fc58b418fd7c5f658d28d943854f8a849b01a5d0a1e6f3f3fdd0166804b" + ], + "markers": "python_version >= '3.6'", + "version": "==2.9.1" + }, + "toml": { + "hashes": [ + "sha256:806143ae5bfb6a3c6e736a764057db0e6a0e05e338b5630894a5f779cabb4f9b", + "sha256:b3bda1d108d5dd99f4a20d24d9c348e91c4db7ab1b749200bded2f839ccbe68f" + ], + "markers": "python_version >= '2.6' and python_version not in '3.0, 3.1, 3.2, 3.3'", + "version": 
class InstagramArchiver(Archiver):
    """
    Uses Instaloader to download either a single post/reel (inc images, videos, text)
    or as much as possible from a profile (posts, tagged posts, IGTV posts, stories, highlights).

    Everything Instaloader writes into DOWNLOAD_FOLDER is uploaded to the configured
    storage and listed in a generated HTML page; the folder is removed afterwards.
    """
    name = "instagram"
    DOWNLOAD_FOLDER = "instaloader"
    # NB: post should be tested before profile, because the profile pattern also
    # matches post URLs (it would capture "p" / "reel" as the username).
    # Derived from https://regex101.com/r/MGPquX/1 -- literal dots are escaped and
    # "-" is accepted in the shortcode (shortcodes may contain "-" and "_").
    post_pattern = re.compile(r"(?:(?:http|https):\/\/)?(?:www\.)?(?:instagram\.com|instagr\.am|instagr\.com)\/(?:p|reel)\/([\w-]+)")
    # Derived from https://regex101.com/r/6Wbsxa/1 -- literal dots are escaped and
    # "." is accepted in the username (usernames may contain periods).
    profile_pattern = re.compile(r"(?:(?:http|https):\/\/)?(?:www\.)?(?:instagram\.com|instagr\.am|instagr\.com)\/([\w.]+)")

    def __init__(self, storage: Storage, config: Config):
        """
        Create the Instaloader client and log in when credentials are configured.

        :param storage: storage backend that downloaded files are uploaded to
        :param config: global configuration; `config.instagram_config` may be falsy,
            in which case Instaloader is used without logging in
        """
        super().__init__(storage, config)
        self.insta = instaloader.Instaloader(
            download_geotags=True,
            download_comments=True,
            compress_json=False,
            dirname_pattern=self.DOWNLOAD_FOLDER,
            filename_pattern="{date_utc}_UTC_{target}__{typename}"
        )
        if config.instagram_config:
            self.insta.login(config.instagram_config.username, config.instagram_config.password)

    def download(self, url, check_if_exists=False):
        """
        Archive `url` if it is an Instagram post/reel or profile link.

        :param url: the URL to archive
        :param check_if_exists: when true and the storage already holds the HTML page
            for this URL, return an 'already archived' result instead of downloading
        :return: an ArchiveResult, or None when `url` is not an Instagram link
            (or when the post download reported nothing new)
        """
        post_matches = self.post_pattern.findall(url)
        profile_matches = self.profile_pattern.findall(url)

        # return if not a valid instagram link
        if not post_matches and not profile_matches:
            return None

        # check if already uploaded
        key = self.get_html_key(url)
        if check_if_exists and self.storage.exists(key):
            # only s3 storage supports storage.exists as not implemented on gd
            cdn_url = self.storage.get_cdn_url(key)
            screenshot = self.get_screenshot(url)
            wacz = self.get_wacz(url)
            return ArchiveResult(status='already archived', cdn_url=cdn_url, screenshot=screenshot, wacz=wacz)

        try:
            # posts take precedence: a post URL also satisfies the profile pattern
            if post_matches:
                return self.download_post(url, post_matches[0])
            return self.download_profile(url, profile_matches[0])
        finally:
            # never leave downloaded files behind, even when Instaloader raises
            shutil.rmtree(self.DOWNLOAD_FOLDER, ignore_errors=True)

    def download_post(self, url, post_id):
        """
        Download a single post identified by its shortcode and upload the result.

        :param url: original URL, used for the generated page and logging
        :param post_id: shortcode captured by `post_pattern`
        :return: an ArchiveResult, or None when Instaloader's download_post returns a falsy value
        """
        logger.debug(f"Instagram {post_id=} detected in {url=}")

        post = instaloader.Post.from_shortcode(self.insta.context, post_id)
        if not self.insta.download_post(post, target=post.owner_username):
            return None
        # NOTE(review): _asdict() is a private Instaloader method -- confirm it is stable enough to rely on
        return self.upload_downloaded_content(url, post.title, post._asdict(), post.date)

    def _download_posts(self, get_posts, target_prefix, kind, source):
        """Best-effort download of every post yielded by `get_posts()`; failures are logged, not raised."""
        try:
            for post in get_posts():
                try:
                    self.insta.download_post(post, target=f"{target_prefix}_{post.owner_username}")
                except Exception as e:
                    logger.error(f"Failed to download {kind}: {post.shortcode}: {e}")
        except Exception as e:
            # listing itself failed (or broke mid-iteration): give up on this source only
            logger.error(f"Failed {source}: {e}")

    def _download_story_items(self, get_containers, target_prefix, kind, source):
        """Best-effort download of every item in each story/highlight yielded by `get_containers()`."""
        try:
            for container in get_containers():
                for item in container.get_items():
                    try:
                        self.insta.download_storyitem(item, target=f"{target_prefix}_{container.owner_username}")
                    except Exception as e:
                        logger.error(f"Failed to download {kind}: {item}: {e}")
        except Exception as e:
            logger.error(f"Failed {source}: {e}")

    def download_profile(self, url, username):
        """
        Download as much as possible from a profile and upload the result.

        Gets posts, posts where username is tagged, igtv posts, stories, and highlights.
        Each source is attempted independently, so one failing source (or item) does
        not prevent the others from being archived.

        :param url: original URL, used for the generated page and logging
        :param username: username captured by `profile_pattern`
        :return: an ArchiveResult
        """
        logger.debug(f"Instagram {username=} detected in {url=}")

        profile = instaloader.Profile.from_username(self.insta.context, username)

        self._download_posts(profile.get_posts, "profile_post", "post", "profile.get_posts")
        self._download_posts(profile.get_tagged_posts, "tagged_post", "tagged post", "profile.get_tagged_posts")
        self._download_posts(profile.get_igtv_posts, "igtv_post", "igtv post", "profile.get_igtv_posts")

        # lambdas defer `profile.userid` so a failure there is logged like any other
        self._download_story_items(lambda: self.insta.get_stories([profile.userid]), "story_item", "story item", "get_stories")
        self._download_story_items(lambda: self.insta.get_highlights(profile.userid), "highlight_item", "highlight item", "get_highlights")

        return self.upload_downloaded_content(url, f"@{username}", profile._asdict(), None)

    def upload_downloaded_content(self, url, title, content, date):
        """
        Upload every file in DOWNLOAD_FOLDER and build the HTML page that lists them.

        :param url: original URL being archived
        :param title: title stored on the result
        :param content: metadata object; its `str()` is HTML-escaped into the page
        :param date: timestamp stored on the result (may be None)
        :return: an ArchiveResult with status "success" or "error"; on error the
            page/screenshot/wacz fields are None
        """
        status = "success"
        # Bind everything the final ArchiveResult needs up front: previously these
        # names were only assigned on success, so any failure ended in a NameError.
        page_cdn = page_hash = screenshot = wacz = None
        try:
            uploaded_media = []
            for entry in os.listdir(self.DOWNLOAD_FOLDER):
                filename = os.path.join(self.DOWNLOAD_FOLDER, entry)
                if not os.path.isfile(filename):
                    continue
                key = self.get_key(filename)
                self.storage.upload(filename, key)
                file_hash = self.get_hash(filename)
                cdn_url = self.storage.get_cdn_url(key)
                uploaded_media.append({'cdn_url': cdn_url, 'key': key, 'hash': file_hash})

            # Explicit raise instead of `assert`, which is stripped under `python -O`.
            # NOTE(review): the original threshold (more than ONE file) is kept; presumably
            # a lone file is just the metadata JSON -- confirm against Instaloader's output.
            if len(uploaded_media) <= 1:
                raise RuntimeError("No uploaded media found")

            uploaded_media.sort(key=lambda m: m["key"], reverse=True)

            page_cdn, page_hash, _ = self.generate_media_page_html(url, uploaded_media, html.escape(str(content)))
        except Exception as e:
            logger.error(f"Could not fetch instagram post {url} due to: {e}")
            status = "error"
        finally:
            shutil.rmtree(self.DOWNLOAD_FOLDER, ignore_errors=True)

        if status == "success":
            screenshot = self.get_screenshot(url)
            wacz = self.get_wacz(url)

        return ArchiveResult(status=status, cdn_url=page_cdn, title=title, timestamp=date, hash=page_hash, screenshot=screenshot, wacz=wacz)
@dataclass
class InstagramConfig:
    """
    Instagram login credentials, read from the `instagram` entry of the secrets config.
    """
    username: str  # account name passed to the Instagram login
    password: str  # plaintext password; deliberately kept out of repr()

    def __repr__(self) -> str:
        # Replaces the auto-generated dataclass repr, which would print the
        # password verbatim whenever the config object is logged or shown in a traceback.
        return f"{type(self).__name__}(username={self.username!r}, password='***')"
"service_account.json"