diff --git a/src/auto_archiver/modules/instagram_extractor/__manifest__.py b/src/auto_archiver/modules/instagram_extractor/__manifest__.py index 05cae19..a66389f 100644 --- a/src/auto_archiver/modules/instagram_extractor/__manifest__.py +++ b/src/auto_archiver/modules/instagram_extractor/__manifest__.py @@ -10,25 +10,30 @@ "requires_setup": True, "configs": { "username": {"required": True, - "help": "a valid Instagram username"}, + "help": "A valid Instagram username."}, "password": { "required": True, - "help": "the corresponding Instagram account password", + "help": "The corresponding Instagram account password.", }, "download_folder": { "default": "instaloader", - "help": "name of a folder to temporarily download content to", + "help": "Name of a folder to temporarily download content to.", }, "session_file": { "default": "secrets/instaloader.session", - "help": "path to the instagram session which saves session credentials", + "help": "Path to the instagram session file which saves session credentials. If one doesn't exist this gives the path to store a new one.", }, # TODO: fine-grain # "download_stories": {"default": True, "help": "if the link is to a user profile: whether to get stories information"}, }, "description": """ - Uses the [Instaloader library](https://instaloader.github.io/as-module.html) to download content from Instagram. This class handles both individual posts - and user profiles, downloading as much information as possible, including images, videos, text, stories, + Uses the [Instaloader library](https://instaloader.github.io/as-module.html) to download content from Instagram. + + > ⚠️ **Warning** + > This module is not actively maintained due to known issues with blocking. + > Prioritise usage of the [Instagram Tbot Extractor](./instagram_tbot_extractor.md) and [Instagram API Extractor](./instagram_api_extractor.md) + + This class handles both individual posts and user profiles, downloading as much information as possible, including images, videos, text, stories, highlights, and tagged posts. Authentication is required via username/password or a session file. diff --git a/src/auto_archiver/modules/instagram_extractor/instagram_extractor.py b/src/auto_archiver/modules/instagram_extractor/instagram_extractor.py index 0af2c32..7e195ad 100644 --- a/src/auto_archiver/modules/instagram_extractor/instagram_extractor.py +++ b/src/auto_archiver/modules/instagram_extractor/instagram_extractor.py @@ -3,7 +3,7 @@ highlights, and tagged posts. Authentication is required via username/password or a session file. """ -import re, os, shutil, traceback +import re, os, shutil import instaloader from loguru import logger @@ -15,10 +15,9 @@ class InstagramExtractor(Extractor): """ Uses Instaloader to download either a post (inc images, videos, text) or as much as possible from a profile (posts, stories, highlights, ...) """ + # NB: post regex should be tested before profile - valid_url = re.compile(r"(?:(?:http|https):\/\/)?(?:www.)?(?:instagram.com|instagr.am|instagr.com)\/") - # https://regex101.com/r/MGPquX/1 post_pattern = re.compile(r"{valid_url}(?:p|reel)\/(\w+)".format(valid_url=valid_url)) # https://regex101.com/r/6Wbsxa/1 @@ -28,19 +27,22 @@ class InstagramExtractor(Extractor): def setup(self) -> None: self.insta = instaloader.Instaloader( - download_geotags=True, download_comments=True, compress_json=False, dirname_pattern=self.download_folder, filename_pattern="{date_utc}_UTC_{target}__{typename}" + download_geotags=True, + download_comments=True, + compress_json=False, + dirname_pattern=self.download_folder, + filename_pattern="{date_utc}_UTC_{target}__{typename}" ) try: self.insta.load_session_from_file(self.username, self.session_file) except Exception as e: - logger.error(f"Unable to login from session file: {e}\n{traceback.format_exc()}") try: - self.insta.login(self.username, config.instagram_self.password) - # TODO: wait for this issue to be fixed https://github.com/instaloader/instaloader/issues/1758 + logger.debug(f"Session file failed", exc_info=True) + logger.info("No valid session file found - Attempting login with use and password.") + self.insta.login(self.username, self.password) self.insta.save_session_to_file(self.session_file) - except Exception as e2: - logger.error(f"Unable to finish login (retrying from file): {e2}\n{traceback.format_exc()}") - + except Exception as e: + logger.error(f"Failed to setup Instagram Extractor with Instagrapi. {e}") def download(self, item: Metadata) -> Metadata: diff --git a/tests/extractors/test_instagram_extractor.py b/tests/extractors/test_instagram_extractor.py index 7efe1b1..647cab4 100644 --- a/tests/extractors/test_instagram_extractor.py +++ b/tests/extractors/test_instagram_extractor.py @@ -1,21 +1,36 @@ import pytest from auto_archiver.modules.instagram_extractor import InstagramExtractor -from .test_extractor_base import TestExtractorBase -class TestInstagramExtractor(TestExtractorBase): + +@pytest.fixture +def instagram_extractor(setup_module, mocker): extractor_module: str = 'instagram_extractor' - config: dict = {} + config: dict = { + "username": "user_name", + "password": "password123", + "download_folder": "instaloader", + "session_file": "secrets/instaloader.session", + } + fake_loader = mocker.MagicMock() + fake_loader.load_session_from_file.return_value = None + fake_loader.login.return_value = None + fake_loader.save_session_to_file.return_value = None + mocker.patch("instaloader.Instaloader", return_value=fake_loader,) + return setup_module(extractor_module, config) - @pytest.mark.parametrize("url", [ - "https://www.instagram.com/p/", - "https://www.instagram.com/p/1234567890/", - "https://www.instagram.com/reel/1234567890/", - "https://www.instagram.com/username/", - "https://www.instagram.com/username/stories/", - "https://www.instagram.com/username/highlights/", - ]) - def test_regex_matches(self, url): - # post - assert InstagramExtractor.valid_url.match(url) + +@pytest.mark.parametrize("url", [ + "https://www.instagram.com/p/", + "https://www.instagram.com/p/1234567890/", + "https://www.instagram.com/reel/1234567890/", + "https://www.instagram.com/username/", + "https://www.instagram.com/username/stories/", + "https://www.instagram.com/username/highlights/", +]) +def test_regex_matches(url: str, instagram_extractor: InstagramExtractor) -> None: + """ + Ensure that the valid_url regex matches all provided Instagram URLs. + """ + assert instagram_extractor.valid_url.match(url) \ No newline at end of file