From 94e0803fb347cef7f7ead0d55b862accd6037201 Mon Sep 17 00:00:00 2001 From: mgaughan Date: Wed, 23 Jul 2025 14:22:11 -0400 Subject: [PATCH 1/3] implementing default metadata omission/user metadata selection --- src/auto_archiver/core/orchestrator.py | 8 +++ .../metadata_enricher/metadata_enricher.py | 38 ++++++++++++-- tests/enrichers/test_metadata_enricher.py | 49 ++++++++++++++++++- 3 files changed, 90 insertions(+), 5 deletions(-) diff --git a/src/auto_archiver/core/orchestrator.py b/src/auto_archiver/core/orchestrator.py index 9d914d1..27eb612 100644 --- a/src/auto_archiver/core/orchestrator.py +++ b/src/auto_archiver/core/orchestrator.py @@ -274,6 +274,14 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_ default=False, ) + parser.add_argument( + "--metadata", + dest="requested_metadata", + help="An array of specific metadata fields to select from the collected content.", + default=[], + nargs="?", + ) + def add_individual_module_args( self, modules: list[LazyBaseModule] = None, parser: argparse.ArgumentParser = None ) -> None: diff --git a/src/auto_archiver/modules/metadata_enricher/metadata_enricher.py b/src/auto_archiver/modules/metadata_enricher/metadata_enricher.py index 4ed47f3..c0a43bd 100644 --- a/src/auto_archiver/modules/metadata_enricher/metadata_enricher.py +++ b/src/auto_archiver/modules/metadata_enricher/metadata_enricher.py @@ -11,19 +11,21 @@ class MetadataEnricher(Enricher): Extracts metadata information from files using exiftool. """ - def enrich(self, to_enrich: Metadata) -> None: + def enrich(self, to_enrich: Metadata, md_grocery_list=["author", "datetimes", "location"]) -> None: logger.debug("Extracting EXIF metadata") for i, m in enumerate(to_enrich.media): if len(md := self.get_metadata(m.filename)): - to_enrich.media[i].set("metadata", md) + # feature flag has this currently turned on + specified_md = self.select_metadata(md, md_grocery_list) + to_enrich.media[i].set("metadata", specified_md) + # to_enrich.media[i].set("metadata", md) def get_metadata(self, filename: str) -> dict: try: # Run ExifTool command to extract metadata from the file cmd = ["exiftool", filename] result = subprocess.run(cmd, capture_output=True, text=True) - # Process the output to extract individual metadata fields metadata = {} for line in result.stdout.splitlines(): @@ -35,3 +37,33 @@ class MetadataEnricher(Enricher): except Exception as e: logger.error(f"Error occurred: {e}: {traceback.format_exc()}") return {} + + def select_metadata(self, all_md, md_grocery_list): + """ + coordinates the selection of metadata from the general exiftool output to the user-specified grocery list + """ + # defining the batches of metadata that get pulled for special terms + author_key_terms = ["author", "producer", "creator"] + datetime_key_terms = ["date", "time"] + location_key_terms = ["gps", "latitude", "longitude"] + + specified_md = {} + for md_key in all_md.keys(): + md_key_lower = md_key.lower() + # checking for special baskets within the grocery list of requested metadata + if ("author" in md_grocery_list) and any( + term in md_key_lower and len(all_md[md_key]) for term in author_key_terms + ): + specified_md[md_key] = all_md[md_key] + if ("datetime" in md_grocery_list) and any( + term in md_key_lower and len(all_md[md_key]) for term in datetime_key_terms + ): + specified_md[md_key] = all_md[md_key] + if ("location" in md_grocery_list) and any( + term in md_key_lower and len(all_md[md_key]) for term in location_key_terms + ): + specified_md[md_key] = all_md[md_key] + # if the metadata value is requested directly + if md_key_lower in md_grocery_list or md_key in md_grocery_list and len(all_md[md_key]): + specified_md[md_key] = all_md[md_key] + return specified_md diff --git a/tests/enrichers/test_metadata_enricher.py b/tests/enrichers/test_metadata_enricher.py index a640920..224a0ba 100644 --- a/tests/enrichers/test_metadata_enricher.py +++ b/tests/enrichers/test_metadata_enricher.py @@ -49,13 +49,25 @@ def test_enrich_sets_metadata(enricher, mocker): metadata.media = [media1, media2] enricher.get_metadata = lambda f: {"key": "value"} if f == "img1.jpg" else {} - enricher.enrich(metadata) + enricher.enrich(metadata, ["key"]) media1.set.assert_called_once_with("metadata", {"key": "value"}) media2.set.assert_not_called() assert metadata.media == [media1, media2] +def test_enrich_no_metadata_selection(enricher, mocker): + media1 = mocker.Mock(filename="img1.jpg") + media2 = mocker.Mock(filename="img2.jpg") + metadata = mocker.Mock() + metadata.media = [media1, media2] + enricher.get_metadata = lambda f: {"key": "value"} if f == "img1.jpg" else {} + enricher.enrich(metadata) + media1.set.assert_called_once_with("metadata", {}) + media2.set.assert_not_called() + assert metadata.media == [media1, media2] + + def test_enrich_empty_media(enricher, mocker): metadata = mocker.Mock() metadata.media = [] @@ -71,7 +83,9 @@ def test_get_metadata_error_handling(enricher, mocker): assert "Error occurred: " in mock_log.call_args[0][0] -def test_metadata_pickle(enricher, unpickle, mocker): +# TODO depends on the expected functionality +""" +def test_default_metadata_pickle(enricher, unpickle, mocker): mock_run = mocker.patch("subprocess.run") # Uses pickled values mock_run.return_value = unpickle("metadata_enricher_exif.pickle") @@ -79,6 +93,37 @@ def test_metadata_pickle(enricher, unpickle, mocker): expected = unpickle("metadata_enricher_ytshort_expected.pickle") enricher.enrich(metadata) expected_media = expected.media + print(expected_media) actual_media = metadata.media + assert len(expected_media) == len(actual_media) assert actual_media[0].properties.get("metadata") == expected_media[0].properties.get("metadata") +""" + + +def test_metadata_pickle_megapixel(enricher, unpickle, mocker): + mock_run = mocker.patch("subprocess.run") + mock_run.return_value = unpickle("metadata_enricher_exif.pickle") + metadata = unpickle("metadata_enricher_ytshort_input.pickle") + + enricher.enrich(metadata, ["megapixels"]) + actual_media = metadata.media + + assert actual_media[0].properties.get("metadata") == {"Megapixels": "0.922"} + + +def test_metadata_specify_datetime_and_metapixels(enricher, unpickle, mocker): + mock_run = mocker.patch("subprocess.run") + mock_run.return_value = unpickle("metadata_enricher_exif.pickle") + metadata = unpickle("metadata_enricher_ytshort_input.pickle") + # expected_md = {"Metapixels":"0.922", "File Inode Change Date/Time":"2025:02:18 19:42:50+00:00"} + + enricher.enrich(metadata, ["datetime", "megapixels"]) + actual_media = metadata.media + + assert actual_media[0].properties.get("metadata") == { + "File Modification Date/Time": "2025:02:18 19:42:50+00:00", + "File Access Date/Time": "2025:02:18 19:42:50+00:00", + "File Inode Change Date/Time": "2025:02:18 19:42:50+00:00", + "Megapixels": "0.922", + } From 53dc9904ce36b49ea7b92b14aa5b51a7306cbbec Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Thu, 8 Jan 2026 14:30:26 +0000 Subject: [PATCH 2/3] refactorws PR to obey standard code approach --- src/auto_archiver/core/orchestrator.py | 8 -------- .../modules/metadata_enricher/__manifest__.py | 7 +++++++ .../metadata_enricher/metadata_enricher.py | 19 +++++++++---------- tests/enrichers/test_metadata_enricher.py | 10 ++++++---- 4 files changed, 22 insertions(+), 22 deletions(-) diff --git a/src/auto_archiver/core/orchestrator.py b/src/auto_archiver/core/orchestrator.py index 27eb612..9d914d1 100644 --- a/src/auto_archiver/core/orchestrator.py +++ b/src/auto_archiver/core/orchestrator.py @@ -274,14 +274,6 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_ default=False, ) - parser.add_argument( - "--metadata", - dest="requested_metadata", - help="An array of specific metadata fields to select from the collected content.", - default=[], - nargs="?", - ) - def add_individual_module_args( self, modules: list[LazyBaseModule] = None, parser: argparse.ArgumentParser = None ) -> None: diff --git a/src/auto_archiver/modules/metadata_enricher/__manifest__.py b/src/auto_archiver/modules/metadata_enricher/__manifest__.py index 3727551..f5defcc 100644 --- a/src/auto_archiver/modules/metadata_enricher/__manifest__.py +++ b/src/auto_archiver/modules/metadata_enricher/__manifest__.py @@ -3,6 +3,13 @@ "type": ["enricher"], "requires_setup": True, "dependencies": {"python": ["loguru"], "bin": ["exiftool"]}, + "configs": { + "look_for_keys": { + "default": [], + "help": "list of lowercased metadata keys that will be included in the enriched metadata. Special keys: 'author', 'datetimes', 'location' to include related metadata fields. The default empty list `[]` means all metadata will be included.", + "type": "list", + }, + }, "description": """ Extracts metadata information from files using ExifTool. diff --git a/src/auto_archiver/modules/metadata_enricher/metadata_enricher.py b/src/auto_archiver/modules/metadata_enricher/metadata_enricher.py index c0a43bd..b7488cf 100644 --- a/src/auto_archiver/modules/metadata_enricher/metadata_enricher.py +++ b/src/auto_archiver/modules/metadata_enricher/metadata_enricher.py @@ -11,15 +11,14 @@ class MetadataEnricher(Enricher): Extracts metadata information from files using exiftool. """ - def enrich(self, to_enrich: Metadata, md_grocery_list=["author", "datetimes", "location"]) -> None: + def enrich(self, to_enrich: Metadata) -> None: logger.debug("Extracting EXIF metadata") for i, m in enumerate(to_enrich.media): if len(md := self.get_metadata(m.filename)): - # feature flag has this currently turned on - specified_md = self.select_metadata(md, md_grocery_list) - to_enrich.media[i].set("metadata", specified_md) - # to_enrich.media[i].set("metadata", md) + if self.look_for_keys != []: + md = self.select_metadata(md, self.look_for_keys) + to_enrich.media[i].set("metadata", md) def get_metadata(self, filename: str) -> dict: try: @@ -38,7 +37,7 @@ class MetadataEnricher(Enricher): logger.error(f"Error occurred: {e}: {traceback.format_exc()}") return {} - def select_metadata(self, all_md, md_grocery_list): + def select_metadata(self, all_md, requested_metadata_keys): """ coordinates the selection of metadata from the general exiftool output to the user-specified grocery list """ @@ -51,19 +50,19 @@ class MetadataEnricher(Enricher): for md_key in all_md.keys(): md_key_lower = md_key.lower() # checking for special baskets within the grocery list of requested metadata - if ("author" in md_grocery_list) and any( + if ("author" in requested_metadata_keys) and any( term in md_key_lower and len(all_md[md_key]) for term in author_key_terms ): specified_md[md_key] = all_md[md_key] - if ("datetime" in md_grocery_list) and any( + if ("datetime" in requested_metadata_keys) and any( term in md_key_lower and len(all_md[md_key]) for term in datetime_key_terms ): specified_md[md_key] = all_md[md_key] - if ("location" in md_grocery_list) and any( + if ("location" in requested_metadata_keys) and any( term in md_key_lower and len(all_md[md_key]) for term in location_key_terms ): specified_md[md_key] = all_md[md_key] # if the metadata value is requested directly - if md_key_lower in md_grocery_list or md_key in md_grocery_list and len(all_md[md_key]): + if md_key_lower in requested_metadata_keys or md_key in requested_metadata_keys and len(all_md[md_key]): specified_md[md_key] = all_md[md_key] return specified_md diff --git a/tests/enrichers/test_metadata_enricher.py b/tests/enrichers/test_metadata_enricher.py index 224a0ba..83da4bb 100644 --- a/tests/enrichers/test_metadata_enricher.py +++ b/tests/enrichers/test_metadata_enricher.py @@ -49,7 +49,7 @@ def test_enrich_sets_metadata(enricher, mocker): metadata.media = [media1, media2] enricher.get_metadata = lambda f: {"key": "value"} if f == "img1.jpg" else {} - enricher.enrich(metadata, ["key"]) + enricher.enrich(metadata) media1.set.assert_called_once_with("metadata", {"key": "value"}) media2.set.assert_not_called() @@ -62,6 +62,7 @@ def test_enrich_no_metadata_selection(enricher, mocker): metadata = mocker.Mock() metadata.media = [media1, media2] enricher.get_metadata = lambda f: {"key": "value"} if f == "img1.jpg" else {} + enricher.look_for_keys = ["no-key"] enricher.enrich(metadata) media1.set.assert_called_once_with("metadata", {}) media2.set.assert_not_called() @@ -106,7 +107,8 @@ def test_metadata_pickle_megapixel(enricher, unpickle, mocker): mock_run.return_value = unpickle("metadata_enricher_exif.pickle") metadata = unpickle("metadata_enricher_ytshort_input.pickle") - enricher.enrich(metadata, ["megapixels"]) + enricher.look_for_keys = ["megapixels"] + enricher.enrich(metadata) actual_media = metadata.media assert actual_media[0].properties.get("metadata") == {"Megapixels": "0.922"} @@ -116,9 +118,9 @@ def test_metadata_specify_datetime_and_metapixels(enricher, unpickle, mocker): mock_run = mocker.patch("subprocess.run") mock_run.return_value = unpickle("metadata_enricher_exif.pickle") metadata = unpickle("metadata_enricher_ytshort_input.pickle") - # expected_md = {"Metapixels":"0.922", "File Inode Change Date/Time":"2025:02:18 19:42:50+00:00"} - enricher.enrich(metadata, ["datetime", "megapixels"]) + enricher.look_for_keys = ["datetime", "megapixels"] + enricher.enrich(metadata) actual_media = metadata.media assert actual_media[0].properties.get("metadata") == { From bac809451c869354a92643ba4b882386ed40c08e Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Thu, 8 Jan 2026 14:33:16 +0000 Subject: [PATCH 3/3] expands tests to included non predefined metadata keys --- tests/enrichers/test_metadata_enricher.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/enrichers/test_metadata_enricher.py b/tests/enrichers/test_metadata_enricher.py index 83da4bb..f784824 100644 --- a/tests/enrichers/test_metadata_enricher.py +++ b/tests/enrichers/test_metadata_enricher.py @@ -119,7 +119,7 @@ def test_metadata_specify_datetime_and_metapixels(enricher, unpickle, mocker): mock_run.return_value = unpickle("metadata_enricher_exif.pickle") metadata = unpickle("metadata_enricher_ytshort_input.pickle") - enricher.look_for_keys = ["datetime", "megapixels"] + enricher.look_for_keys = ["datetime", "megapixels", "image height"] enricher.enrich(metadata) actual_media = metadata.media @@ -128,4 +128,5 @@ def test_metadata_specify_datetime_and_metapixels(enricher, unpickle, mocker): "File Access Date/Time": "2025:02:18 19:42:50+00:00", "File Inode Change Date/Time": "2025:02:18 19:42:50+00:00", "Megapixels": "0.922", + "Image Height": "720", }