implementing default metadata omission/user metadata selection

This commit is contained in:
mgaughan
2025-07-23 14:22:11 -04:00
parent 1256fde159
commit 94e0803fb3
3 changed files with 90 additions and 5 deletions

View File

@@ -274,6 +274,14 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_
default=False,
)
parser.add_argument(
    "--metadata",
    dest="requested_metadata",
    help="An array of specific metadata fields to select from the collected content.",
    default=[],
    # "*" gathers zero or more values into a list, matching the list default and
    # the "array" promised by the help text; "?" would store a single string
    # (or None when the flag is given without a value).
    nargs="*",
)
def add_individual_module_args(
self, modules: list[LazyBaseModule] = None, parser: argparse.ArgumentParser = None
) -> None:

View File

@@ -11,19 +11,21 @@ class MetadataEnricher(Enricher):
Extracts metadata information from files using exiftool.
"""
def enrich(self, to_enrich: Metadata, md_grocery_list=("author", "datetime", "location")) -> None:
    """
    Attach a filtered subset of each media file's exiftool metadata to that media item.

    For every entry in `to_enrich.media`, the full metadata is read with
    `self.get_metadata(filename)`; when that result is non-empty it is narrowed
    with `self.select_metadata` and stored under the "metadata" key.

    :param to_enrich: the item whose `media` entries are enriched in place.
    :param md_grocery_list: names of the metadata fields (or the special groups
        "author", "datetime", "location") to keep. The default is an immutable
        tuple so it cannot be shared and mutated across calls, and it uses the
        "datetime" spelling that `select_metadata` checks for.
    """
    logger.debug("Extracting EXIF metadata")
    for i, m in enumerate(to_enrich.media):
        if len(md := self.get_metadata(m.filename)):
            # Only the requested fields are stored; the rest of the exiftool
            # output is intentionally omitted.
            specified_md = self.select_metadata(md, md_grocery_list)
            to_enrich.media[i].set("metadata", specified_md)
def get_metadata(self, filename: str) -> dict:
try:
# Run ExifTool command to extract metadata from the file
cmd = ["exiftool", filename]
result = subprocess.run(cmd, capture_output=True, text=True)
# Process the output to extract individual metadata fields
metadata = {}
for line in result.stdout.splitlines():
@@ -35,3 +37,33 @@ class MetadataEnricher(Enricher):
except Exception as e:
logger.error(f"Error occurred: {e}: {traceback.format_exc()}")
return {}
def select_metadata(self, all_md, md_grocery_list):
    """
    coordinates the selection of metadata from the general exiftool output to the user-specified grocery list

    :param all_md: mapping of metadata field name -> value.
    :param md_grocery_list: the requested names. Each entry is either one of the
        special basket names ("author", "datetime"/"datetimes", "location"),
        which pull in every field whose name contains one of the basket's key
        terms, or a field name to select directly (compared case-insensitively).
        A single string is read as a comma-separated list; None selects nothing.
    :return: a new dict holding only the requested fields whose value is non-empty.
    """
    # defining the batches of metadata that get pulled for special terms
    baskets = {
        "author": ("author", "producer", "creator"),
        "datetime": ("date", "time"),
        "location": ("gps", "latitude", "longitude"),
    }
    # alternate spellings accepted for a basket name
    basket_aliases = {"datetimes": "datetime"}

    if md_grocery_list is None:
        return {}
    if isinstance(md_grocery_list, str):
        # a bare string would otherwise be matched by substring instead of by name
        md_grocery_list = [part.strip() for part in md_grocery_list.split(",") if part.strip()]

    requested = {item.lower() for item in md_grocery_list if isinstance(item, str)}
    requested_baskets = {basket_aliases.get(name, name) for name in requested}
    active_terms = [
        term
        for basket, terms in baskets.items()
        if basket in requested_baskets
        for term in terms
    ]

    specified_md = {}
    for md_key, md_value in all_md.items():
        # empty values are never selected, whichever way the field was requested
        try:
            if len(md_value) == 0:
                continue
        except TypeError:
            # unsized values (e.g. numbers) count as present unless they are None
            if md_value is None:
                continue

        md_key_lower = md_key.lower()
        # the field was requested directly, or it belongs to a requested basket
        if md_key_lower in requested or any(term in md_key_lower for term in active_terms):
            specified_md[md_key] = md_value
    return specified_md