From a705a78632da1ca6d852dbe338c263a31943ee2a Mon Sep 17 00:00:00 2001 From: erinhmclark Date: Mon, 3 Mar 2025 21:06:09 +0000 Subject: [PATCH 1/4] Fix instagram_extractor.py typo in config value. --- .../instagram_extractor.py | 28 +++++++++----- tests/extractors/test_instagram_extractor.py | 37 ++++++++++++------- 2 files changed, 43 insertions(+), 22 deletions(-) diff --git a/src/auto_archiver/modules/instagram_extractor/instagram_extractor.py b/src/auto_archiver/modules/instagram_extractor/instagram_extractor.py index 0af2c32..7ae3b01 100644 --- a/src/auto_archiver/modules/instagram_extractor/instagram_extractor.py +++ b/src/auto_archiver/modules/instagram_extractor/instagram_extractor.py @@ -3,7 +3,9 @@ highlights, and tagged posts. Authentication is required via username/password or a session file. """ -import re, os, shutil, traceback +import re, os, shutil +from sys import exc_info + import instaloader from loguru import logger @@ -28,19 +30,27 @@ class InstagramExtractor(Extractor): def setup(self) -> None: self.insta = instaloader.Instaloader( - download_geotags=True, download_comments=True, compress_json=False, dirname_pattern=self.download_folder, filename_pattern="{date_utc}_UTC_{target}__{typename}" + download_geotags=True, + download_comments=True, + compress_json=False, + dirname_pattern=self.download_folder, + filename_pattern="{date_utc}_UTC_{target}__{typename}" ) try: self.insta.load_session_from_file(self.username, self.session_file) - except Exception as e: - logger.error(f"Unable to login from session file: {e}\n{traceback.format_exc()}") + except FileNotFoundError: + logger.info("No existing session file found - Attempting login with use and password.") try: - self.insta.login(self.username, config.instagram_self.password) - # TODO: wait for this issue to be fixed https://github.com/instaloader/instaloader/issues/1758 + self.insta.login(self.username, self.password) self.insta.save_session_to_file(self.session_file) - except Exception as e2: - logger.error(f"Unable to finish login (retrying from file): {e2}\n{traceback.format_exc()}") - + except Exception as e: + logger.error(f"Failed to log in with Instaloader: {e}") + # TODO raise exception? + # raise Exception(f"Failed to log in with Instaloader: {e}") + except Exception as e: + logger.error(f"Error loading session file: {e}") + # TODO raise exception? + # raise Exception(f"Error loading session file: {e}") def download(self, item: Metadata) -> Metadata: diff --git a/tests/extractors/test_instagram_extractor.py b/tests/extractors/test_instagram_extractor.py index 7efe1b1..97549b8 100644 --- a/tests/extractors/test_instagram_extractor.py +++ b/tests/extractors/test_instagram_extractor.py @@ -3,19 +3,30 @@ import pytest from auto_archiver.modules.instagram_extractor import InstagramExtractor from .test_extractor_base import TestExtractorBase -class TestInstagramExtractor(TestExtractorBase): + +@pytest.fixture +def intsagram_extractor(setup_module): extractor_module: str = 'instagram_extractor' - config: dict = {} + config: dict = { + "username": "user_name", + "password": "password123", + "download_folder": "instaloader", + "session_file": "secrets/instaloader.session", + } + return setup_module(extractor_module, config) - @pytest.mark.parametrize("url", [ - "https://www.instagram.com/p/", - "https://www.instagram.com/p/1234567890/", - "https://www.instagram.com/reel/1234567890/", - "https://www.instagram.com/username/", - "https://www.instagram.com/username/stories/", - "https://www.instagram.com/username/highlights/", - ]) - def test_regex_matches(self, url): - # post - assert InstagramExtractor.valid_url.match(url) + + + +@pytest.mark.parametrize("url", [ + "https://www.instagram.com/p/", + "https://www.instagram.com/p/1234567890/", + "https://www.instagram.com/reel/1234567890/", + "https://www.instagram.com/username/", + "https://www.instagram.com/username/stories/", + "https://www.instagram.com/username/highlights/", +]) +def test_regex_matches(url, instagram_extractor): + # post + assert instagram_extractor.valid_url.match(url) From fa1e65f54c5bf8ee09b2e6fcec0e7c997cc6b97a Mon Sep 17 00:00:00 2001 From: erinhmclark Date: Thu, 6 Mar 2025 16:25:38 +0000 Subject: [PATCH 2/4] Fix instagram_extractor.py typo, add warning to docs, and add basic regex test. --- .../instagram_extractor/__manifest__.py | 17 +++++++++++------ .../instagram_extractor/instagram_extractor.py | 18 +++++------------- tests/extractors/test_instagram_extractor.py | 18 +++++++++++------- 3 files changed, 27 insertions(+), 26 deletions(-) diff --git a/src/auto_archiver/modules/instagram_extractor/__manifest__.py b/src/auto_archiver/modules/instagram_extractor/__manifest__.py index 05cae19..c9b479a 100644 --- a/src/auto_archiver/modules/instagram_extractor/__manifest__.py +++ b/src/auto_archiver/modules/instagram_extractor/__manifest__.py @@ -10,25 +10,30 @@ "requires_setup": True, "configs": { "username": {"required": True, - "help": "a valid Instagram username"}, + "help": "A valid Instagram username."}, "password": { "required": True, - "help": "the corresponding Instagram account password", + "help": "The corresponding Instagram account password.", }, "download_folder": { "default": "instaloader", - "help": "name of a folder to temporarily download content to", + "help": "Name of a folder to temporarily download content to.", }, "session_file": { "default": "secrets/instaloader.session", - "help": "path to the instagram session which saves session credentials", + "help": "Path to the instagram session file which saves session credentials. If one doesn't exist this gives the path to store a new one.", }, # TODO: fine-grain # "download_stories": {"default": True, "help": "if the link is to a user profile: whether to get stories information"}, }, "description": """ - Uses the [Instaloader library](https://instaloader.github.io/as-module.html) to download content from Instagram. This class handles both individual posts - and user profiles, downloading as much information as possible, including images, videos, text, stories, + Uses the [Instaloader library](https://instaloader.github.io/as-module.html) to download content from Instagram. + + > ⚠️ **Warning** + > This module is not actively maintained due to known issues with blocking. + > Prioritise usage of the `instagram_tbot_extractor` and `instagram_api_extractor`. + + This class handles both individual posts and user profiles, downloading as much information as possible, including images, videos, text, stories, highlights, and tagged posts. Authentication is required via username/password or a session file. diff --git a/src/auto_archiver/modules/instagram_extractor/instagram_extractor.py b/src/auto_archiver/modules/instagram_extractor/instagram_extractor.py index 7ae3b01..7e195ad 100644 --- a/src/auto_archiver/modules/instagram_extractor/instagram_extractor.py +++ b/src/auto_archiver/modules/instagram_extractor/instagram_extractor.py @@ -4,8 +4,6 @@ """ import re, os, shutil -from sys import exc_info - import instaloader from loguru import logger @@ -17,10 +15,9 @@ class InstagramExtractor(Extractor): """ Uses Instaloader to download either a post (inc images, videos, text) or as much as possible from a profile (posts, stories, highlights, ...) """ + # NB: post regex should be tested before profile - valid_url = re.compile(r"(?:(?:http|https):\/\/)?(?:www.)?(?:instagram.com|instagr.am|instagr.com)\/") - # https://regex101.com/r/MGPquX/1 post_pattern = re.compile(r"{valid_url}(?:p|reel)\/(\w+)".format(valid_url=valid_url)) # https://regex101.com/r/6Wbsxa/1 @@ -38,19 +35,14 @@ class InstagramExtractor(Extractor): ) try: self.insta.load_session_from_file(self.username, self.session_file) - except FileNotFoundError: - logger.info("No existing session file found - Attempting login with use and password.") + except Exception as e: try: + logger.debug(f"Session file failed", exc_info=True) + logger.info("No valid session file found - Attempting login with use and password.") self.insta.login(self.username, self.password) self.insta.save_session_to_file(self.session_file) except Exception as e: - logger.error(f"Failed to log in with Instaloader: {e}") - # TODO raise exception? - # raise Exception(f"Failed to log in with Instaloader: {e}") - except Exception as e: - logger.error(f"Error loading session file: {e}") - # TODO raise exception? - # raise Exception(f"Error loading session file: {e}") + logger.error(f"Failed to setup Instagram Extractor with Instagrapi. {e}") def download(self, item: Metadata) -> Metadata: diff --git a/tests/extractors/test_instagram_extractor.py b/tests/extractors/test_instagram_extractor.py index 97549b8..647cab4 100644 --- a/tests/extractors/test_instagram_extractor.py +++ b/tests/extractors/test_instagram_extractor.py @@ -1,11 +1,10 @@ import pytest from auto_archiver.modules.instagram_extractor import InstagramExtractor -from .test_extractor_base import TestExtractorBase @pytest.fixture -def intsagram_extractor(setup_module): +def instagram_extractor(setup_module, mocker): extractor_module: str = 'instagram_extractor' config: dict = { @@ -14,11 +13,14 @@ def intsagram_extractor(setup_module): "download_folder": "instaloader", "session_file": "secrets/instaloader.session", } + fake_loader = mocker.MagicMock() + fake_loader.load_session_from_file.return_value = None + fake_loader.login.return_value = None + fake_loader.save_session_to_file.return_value = None + mocker.patch("instaloader.Instaloader", return_value=fake_loader,) return setup_module(extractor_module, config) - - @pytest.mark.parametrize("url", [ "https://www.instagram.com/p/", "https://www.instagram.com/p/1234567890/", @@ -27,6 +29,8 @@ def intsagram_extractor(setup_module): "https://www.instagram.com/username/stories/", "https://www.instagram.com/username/highlights/", ]) -def test_regex_matches(url, instagram_extractor): - # post - assert instagram_extractor.valid_url.match(url) +def test_regex_matches(url: str, instagram_extractor: InstagramExtractor) -> None: + """ + Ensure that the valid_url regex matches all provided Instagram URLs. + """ + assert instagram_extractor.valid_url.match(url) \ No newline at end of file From 89d2a8bb5477cfa6db57cefe1a9a7705385fdb45 Mon Sep 17 00:00:00 2001 From: erinhmclark Date: Fri, 7 Mar 2025 12:34:19 +0000 Subject: [PATCH 3/4] Update the __manifest__.py of the Instagram Extractor. --- src/auto_archiver/modules/instagram_extractor/__manifest__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/auto_archiver/modules/instagram_extractor/__manifest__.py b/src/auto_archiver/modules/instagram_extractor/__manifest__.py index c9b479a..c5d8a5a 100644 --- a/src/auto_archiver/modules/instagram_extractor/__manifest__.py +++ b/src/auto_archiver/modules/instagram_extractor/__manifest__.py @@ -31,7 +31,7 @@ > ⚠️ **Warning** > This module is not actively maintained due to known issues with blocking. - > Prioritise usage of the `instagram_tbot_extractor` and `instagram_api_extractor`. + > Prioritise usage of the [Instagram Tbot Extracto](./instagram_tbot_extractor.md) and [Instagram API Extractor](./instagram_api_extractor.md) This class handles both individual posts and user profiles, downloading as much information as possible, including images, videos, text, stories, highlights, and tagged posts. From 4df03255a4fe39c7a1e20a546f70c3e8f31630b8 Mon Sep 17 00:00:00 2001 From: erinhmclark Date: Fri, 7 Mar 2025 14:56:35 +0000 Subject: [PATCH 4/4] Fix typo in __manifest__.py --- src/auto_archiver/modules/instagram_extractor/__manifest__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/auto_archiver/modules/instagram_extractor/__manifest__.py b/src/auto_archiver/modules/instagram_extractor/__manifest__.py index c5d8a5a..a66389f 100644 --- a/src/auto_archiver/modules/instagram_extractor/__manifest__.py +++ b/src/auto_archiver/modules/instagram_extractor/__manifest__.py @@ -31,7 +31,7 @@ > ⚠️ **Warning** > This module is not actively maintained due to known issues with blocking. - > Prioritise usage of the [Instagram Tbot Extracto](./instagram_tbot_extractor.md) and [Instagram API Extractor](./instagram_api_extractor.md) + > Prioritise usage of the [Instagram Tbot Extractor](./instagram_tbot_extractor.md) and [Instagram API Extractor](./instagram_api_extractor.md) This class handles both individual posts and user profiles, downloading as much information as possible, including images, videos, text, stories, highlights, and tagged posts.