diff --git a/docs/scripts/scripts.py b/docs/scripts/scripts.py index ca1d348..bfddd29 100644 --- a/docs/scripts/scripts.py +++ b/docs/scripts/scripts.py @@ -52,7 +52,7 @@ def generate_module_docs(): for type in manifest["type"]: modules_by_type.setdefault(type, []).append(module) - description = "\n".join(l.lstrip() for l in manifest["description"].split("\n")) + description = "\n".join(line.lstrip() for line in manifest["description"].split("\n")) types = ", ".join(type_color[t] for t in manifest["type"]) readme_str = f""" # {manifest["name"]} diff --git a/docs/source/installation/setup.md b/docs/source/installation/setup.md index e5c96a6..f5b6e9d 100644 --- a/docs/source/installation/setup.md +++ b/docs/source/installation/setup.md @@ -51,6 +51,7 @@ The invocations below will run the auto-archiver Docker image using a configurat docker run --rm -v $PWD/secrets:/app/secrets -v $PWD/local_archive:/app/local_archive bellingcat/auto-archiver # uses the same configuration, but with the `gsheet_feeder`, a header on row 2 and with some different column names +# Note this expects you to have followed the [Google Sheets setup](how_to/google_sheets.md) and added your service_account.json to the `secrets/` folder # notice that columns is a dictionary so you need to pass it as JSON and it will override only the values provided docker run --rm -v $PWD/secrets:/app/secrets -v $PWD/local_archive:/app/local_archive bellingcat/auto-archiver --feeders=gsheet_feeder --gsheet_feeder.sheet="use it on another sheets doc" --gsheet_feeder.header=2 --gsheet_feeder.columns='{"url": "link"}' # Runs auto-archiver for the first time, but in 'full' mode, enabling all modules to get a full settings file diff --git a/pyproject.toml b/pyproject.toml index 86c8db8..90ac56f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -96,7 +96,7 @@ markers = [ #exclude = ["docs"] line-length = 120 # Remove this for a more detailed lint report -output-format = "concise" +#output-format = "concise" [tool.ruff.lint] @@ -104,7 +104,7 @@ output-format = "concise" # I : isort # UP : upgrade, e.g. use fstrings # ANN : annotations -#extend-select = ["B"] +extend-select = ["B"] # Ignore unused imports as some are currently required for lazy loading # This can be removed for a `lint check` run which is manually reviewed diff --git a/scripts/create_update_gdrive_oauth_token.py b/scripts/create_update_gdrive_oauth_token.py index a57043e..edd2565 100644 --- a/scripts/create_update_gdrive_oauth_token.py +++ b/scripts/create_update_gdrive_oauth_token.py @@ -1,5 +1,6 @@ import os.path -import click, json +import click +import json from google.auth.transport.requests import Request from google.oauth2.credentials import Credentials diff --git a/src/auto_archiver/modules/tiktok_tikwm_extractor/tiktok_tikwm_extractor.py b/src/auto_archiver/modules/tiktok_tikwm_extractor/tiktok_tikwm_extractor.py index 8b07775..e7ed91a 100644 --- a/src/auto_archiver/modules/tiktok_tikwm_extractor/tiktok_tikwm_extractor.py +++ b/src/auto_archiver/modules/tiktok_tikwm_extractor/tiktok_tikwm_extractor.py @@ -14,7 +14,7 @@ class TiktokTikwmExtractor(Extractor): """ TIKWM_ENDPOINT = "https://www.tikwm.com/api/?url={url}" - def download(self, item: Metadata) -> Metadata: + def download(self, item: Metadata) -> bool | Metadata: url = item.get_url() if not re.match(TikTokIE._VALID_URL, url): diff --git a/tests/enrichers/test_meta_enricher.py b/tests/enrichers/test_meta_enricher.py index 906e629..fe0d737 100644 --- a/tests/enrichers/test_meta_enricher.py +++ b/tests/enrichers/test_meta_enricher.py @@ -1,4 +1,3 @@ -import datetime from datetime import datetime, timedelta, timezone import pytest diff --git a/tests/enrichers/test_screenshot_enricher.py b/tests/enrichers/test_screenshot_enricher.py index ee6c2a7..b86bb17 100644 --- a/tests/enrichers/test_screenshot_enricher.py +++ b/tests/enrichers/test_screenshot_enricher.py @@ -15,9 +15,9 @@ def mock_selenium_env(mocker): mock_which = mocker.patch("shutil.which") mock_driver_class = mocker.patch("auto_archiver.utils.webdriver.CookieSettingDriver") mock_binary_paths = mocker.patch("selenium.webdriver.common.selenium_manager.SeleniumManager.binary_paths") - mock_is_file = mocker.patch("pathlib.Path.is_file", return_value=True) + mocker.patch("pathlib.Path.is_file", return_value=True) mock_popen = mocker.patch("subprocess.Popen") - mock_is_connectable = mocker.patch("selenium.webdriver.common.service.Service.is_connectable", return_value=True) + mocker.patch("selenium.webdriver.common.service.Service.is_connectable", return_value=True) mock_firefox_options = mocker.patch("selenium.webdriver.FirefoxOptions") # Define side effect for `shutil.which` @@ -157,13 +157,12 @@ def test_pdf_creation(mocker, screenshot_enricher, metadata_with_video, mock_sel # Mock the print_page method to return base64-encoded content mock_driver.print_page.return_value = base64.b64encode(b"fake_pdf_content").decode("utf-8") # Patch functions with mocker - mock_os_path_join = mocker.patch("os.path.join", side_effect=lambda *args: f"{args[-1]}") - mock_random_str = mocker.patch( + mocker.patch("os.path.join", side_effect=lambda *args: f"{args[-1]}") + mocker.patch( "auto_archiver.modules.screenshot_enricher.screenshot_enricher.random_str", return_value="fixed123", ) mock_open = mocker.patch("builtins.open", new_callable=mocker.mock_open) - mock_log_error = mocker.patch("loguru.logger.error") screenshot_enricher.enrich(metadata_with_video) # Verify screenshot and PDF creation diff --git a/tests/extractors/test_tiktok_tikwm_extractor.py b/tests/extractors/test_tiktok_tikwm_extractor.py index e8ad8df..f675ac0 100644 --- a/tests/extractors/test_tiktok_tikwm_extractor.py +++ b/tests/extractors/test_tiktok_tikwm_extractor.py @@ -39,7 +39,7 @@ class TestTiktokTikwmExtractor(TestExtractorBase): mock_get, mock_logger = self.get_mockers(mocker) if valid_url: mock_get.return_value.status_code = 404 - assert self.extractor.download(make_item(url)) == False + assert self.extractor.download(make_item(url)) is False assert mock_get.call_count == int(valid_url) assert mock_logger.error.call_count == int(valid_url) @@ -47,7 +47,7 @@ class TestTiktokTikwmExtractor(TestExtractorBase): mock_get, mock_logger = self.get_mockers(mocker) mock_get.return_value.status_code = 200 mock_get.return_value.json.side_effect = ValueError - assert self.extractor.download(make_item(self.VALID_EXAMPLE_URL)) == False + assert self.extractor.download(make_item(self.VALID_EXAMPLE_URL)) is False mock_get.assert_called_once() mock_get.return_value.json.assert_called_once() mock_logger.error.assert_called_once() @@ -68,7 +68,7 @@ class TestTiktokTikwmExtractor(TestExtractorBase): mock_get, mock_logger = self.get_mockers(mocker) mock_get.return_value.status_code = 200 mock_get.return_value.json.return_value = response - assert self.extractor.download(make_item(self.VALID_EXAMPLE_URL)) == False + assert self.extractor.download(make_item(self.VALID_EXAMPLE_URL)) is False mock_get.assert_called_once() mock_get.return_value.json.assert_called_once() mock_logger.error.assert_called_once() @@ -86,7 +86,7 @@ class TestTiktokTikwmExtractor(TestExtractorBase): result = self.extractor.download(make_item(self.VALID_EXAMPLE_URL)) if not has_vid: - assert result == False + assert result is False else: assert result.is_success() assert len(result.media) == 1 @@ -99,7 +99,7 @@ class TestTiktokTikwmExtractor(TestExtractorBase): else: mock_logger.error.assert_not_called() - def test_correct_extraction(self, mocker, make_item): + def test_correct_data_extracted(self, mocker, make_item): mock_get, _ = self.get_mockers(mocker) mock_get.return_value.status_code = 200 mock_get.return_value.json.return_value = {"msg": "success", "data": { diff --git a/tests/feeders/test_gsheet_feeder.py b/tests/feeders/test_gsheet_feeder.py index 0fc2681..bf34757 100644 --- a/tests/feeders/test_gsheet_feeder.py +++ b/tests/feeders/test_gsheet_feeder.py @@ -172,10 +172,10 @@ def test_should_process_sheet(setup_module, mocker): "block_worksheets": {"Sheet3"}, }, ) - assert gdb.should_process_sheet("TestSheet") == True - assert gdb.should_process_sheet("Sheet3") == False + assert gdb.should_process_sheet("TestSheet") is True + assert gdb.should_process_sheet("Sheet3") is False # False if allow_worksheets is set - assert gdb.should_process_sheet("AnotherSheet") == False + assert gdb.should_process_sheet("AnotherSheet") is False @pytest.mark.skip(reason="Requires a real connection") diff --git a/tests/storages/test_S3_storage.py b/tests/storages/test_S3_storage.py index abf9763..9e27b3f 100644 --- a/tests/storages/test_S3_storage.py +++ b/tests/storages/test_S3_storage.py @@ -61,7 +61,7 @@ class TestS3Storage: media = Media("test.txt") assert self.storage.is_upload_needed(media) is True self.storage.random_no_duplicate = True - mock_calc_hash = mocker.patch('auto_archiver.modules.s3_storage.s3_storage.calculate_file_hash', return_value='beepboop123beepboop123beepboop123') + mocker.patch('auto_archiver.modules.s3_storage.s3_storage.calculate_file_hash', return_value='beepboop123beepboop123beepboop123') mock_file_in_folder = mocker.patch.object(self.storage, 'file_in_folder', return_value='existing_key.txt') assert self.storage.is_upload_needed(media) is False assert media.key == 'existing_key.txt' @@ -70,10 +70,10 @@ class TestS3Storage: def test_skips_upload_when_duplicate_exists(self, mocker): """Test that upload skips when file_in_folder finds existing object""" self.storage.random_no_duplicate = True - mock_file_in_folder = mocker.patch.object(S3Storage, 'file_in_folder', return_value="existing_folder/existing_file.txt") + mocker.patch.object(S3Storage, 'file_in_folder', return_value="existing_folder/existing_file.txt") media = Media("test.txt") media._key = "original_path.txt" - mock_calculate_hash = mocker.patch('auto_archiver.modules.s3_storage.s3_storage.calculate_file_hash', return_value="beepboop123beepboop123beepboop123") + mocker.patch('auto_archiver.modules.s3_storage.s3_storage.calculate_file_hash', return_value="beepboop123beepboop123beepboop123") assert self.storage.is_upload_needed(media) is False assert media.key == "existing_folder/existing_file.txt" assert media.get("previously archived") is True @@ -101,5 +101,5 @@ class TestS3Storage: ) def test_file_in_folder_exists(self, mocker): - mock_list_objects = mocker.patch.object(self.storage.s3, 'list_objects', return_value={'Contents': [{'Key': 'path/to/file.txt'}]}) + mocker.patch.object(self.storage.s3, 'list_objects', return_value={'Contents': [{'Key': 'path/to/file.txt'}]}) assert self.storage.file_in_folder('path/to/') == 'path/to/file.txt' diff --git a/tests/storages/test_atlos_storage.py b/tests/storages/test_atlos_storage.py index 7268c8d..f273c7e 100644 --- a/tests/storages/test_atlos_storage.py +++ b/tests/storages/test_atlos_storage.py @@ -94,7 +94,6 @@ def test_upload_not_uploaded(tmp_path, atlos_storage: AtlosStorage, metadata: Me call_args = post_mock.call_args[0] assert call_args[0] == expected_endpoint call_kwargs = post_mock.call_args[1] - expected_headers = {"Authorization": f"Bearer {atlos_storage.api_token}"} expected_params = {"title": media.properties} assert call_kwargs["params"] == expected_params file_tuple = call_kwargs["files"]["file"] diff --git a/tests/storages/test_gdrive_storage.py b/tests/storages/test_gdrive_storage.py index 1e418f0..08c516f 100644 --- a/tests/storages/test_gdrive_storage.py +++ b/tests/storages/test_gdrive_storage.py @@ -9,9 +9,8 @@ from tests.storages.test_storage_base import TestStorageBase @pytest.fixture -def gdrive_storage(setup_module, mocker): +def gdrive_storage(setup_module, mocker) -> GDriveStorage: module_name: str = "gdrive_storage" - storage: GDriveStorage config: dict = { "path_generator": "url", "filename_generator": "static", diff --git a/tests/test_config.py b/tests/test_config.py index 0de4f16..03b06e7 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -77,15 +77,15 @@ def test_merge_dicts(): def test_check_types(): - assert config.is_list_type([]) == True - assert config.is_list_type(()) == True - assert config.is_list_type(set()) == True - assert config.is_list_type({}) == False - assert config.is_list_type("") == False - assert config.is_dict_type({}) == True - assert config.is_dict_type(CommentedMap()) == True - assert config.is_dict_type([]) == False - assert config.is_dict_type("") == False + assert config.is_list_type([]) is True + assert config.is_list_type(()) is True + assert config.is_list_type(set()) is True + assert config.is_list_type({}) is False + assert config.is_list_type("") is False + assert config.is_dict_type({}) is True + assert config.is_dict_type(CommentedMap()) is True + assert config.is_dict_type([]) is False + assert config.is_dict_type("") is False def test_from_dot_notation(): diff --git a/tests/test_modules.py b/tests/test_modules.py index 1ff4f45..b6018da 100644 --- a/tests/test_modules.py +++ b/tests/test_modules.py @@ -9,10 +9,8 @@ def example_module(): import auto_archiver module_factory = ModuleFactory() - - previous_path = auto_archiver.modules.__path__ + # previous_path = auto_archiver.modules.__path__ auto_archiver.modules.__path__.append("tests/data/test_modules/") - return module_factory.get_module_lazy("example_module") @@ -84,6 +82,8 @@ def test_load_modules(module_name): # check that default settings are applied default_config = module.configs assert loaded_module.name in loaded_module.config.keys() + defaults = {k: v.get("default") for k, v in default_config.items()} + assert loaded_module.config[module_name] == defaults @pytest.mark.parametrize("module_name", ["local_storage", "generic_extractor", "html_formatter", "csv_db"]) diff --git a/tests/test_orchestrator.py b/tests/test_orchestrator.py index 64d84d8..a79aa70 100644 --- a/tests/test_orchestrator.py +++ b/tests/test_orchestrator.py @@ -67,7 +67,7 @@ def test_version(basic_parser, capsys): def test_help(orchestrator, basic_parser, capsys): args = basic_parser.parse_args(["--help"]) - assert args.help == True + assert args.help is True # test the show_help() on orchestrator with pytest.raises(SystemExit) as exit_error: @@ -116,8 +116,8 @@ def test_check_required_values(orchestrator, caplog, test_args): # drop the example_module.required_field from the test_args test_args = test_args[:-2] - with pytest.raises(SystemExit) as exit_error: - config = orchestrator.setup_config(test_args) + with pytest.raises(SystemExit): + orchestrator.setup_config(test_args) assert caplog.records[1].message == "the following arguments are required: --example_module.required_field" @@ -212,7 +212,7 @@ def test_multiple_orchestrator(test_args): ] o1 = ArchivingOrchestrator() - with pytest.raises(ValueError) as exit_error: + with pytest.raises(ValueError): # this should fail because the gsheet_feeder_db requires a sheet_id / sheet o1.setup(o1_args)