From 3cf51dd8744d98848b31dee0e095d6167a61ca40 Mon Sep 17 00:00:00 2001
From: msramalho <19508417+msramalho@users.noreply.github.com>
Date: Wed, 11 Jun 2025 11:56:42 +0100
Subject: [PATCH] adds tracker remove feature and tests

clean() strips common tracking parameters (utm_source, utm_medium,
utm_campaign, utm_term, utm_content, fbclid, gclid) from the query
string, and the orchestrator now calls it before the extractors'
sanitize_url. Only the query component is filtered; the fragment is
left untouched. Non-tracker parameters with an empty value are kept
(keep_blank_values=True) so that nothing but trackers is removed.
Note: the remaining query is rebuilt with urlencode, so it is
re-encoded (e.g. %20 becomes +).
---
 src/auto_archiver/core/orchestrator.py |  4 +--
 src/auto_archiver/utils/url.py         |  8 ++++--
 tests/utils/test_urls.py               | 37 ++++++++++++++++++++++++++
 3 files changed, 45 insertions(+), 4 deletions(-)

diff --git a/src/auto_archiver/core/orchestrator.py b/src/auto_archiver/core/orchestrator.py
index 0bff376..66073a7 100644
--- a/src/auto_archiver/core/orchestrator.py
+++ b/src/auto_archiver/core/orchestrator.py
@@ -34,7 +34,7 @@ from .config import (
 from .module import ModuleFactory, LazyBaseModule
 from . import validators, Feeder, Extractor, Database, Storage, Formatter, Enricher
 from .consts import MODULE_TYPES, SetupError
-from auto_archiver.utils.url import check_url_or_raise
+from auto_archiver.utils.url import check_url_or_raise, clean
 
 if TYPE_CHECKING:
     from .base_module import BaseModule
@@ -572,7 +572,7 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_
             raise e
 
         # 1 - sanitize - each archiver is responsible for cleaning/expanding its own URLs
-        url = original_url
+        url = clean(original_url)
         for a in self.extractors:
             url = a.sanitize_url(url)
 
diff --git a/src/auto_archiver/utils/url.py b/src/auto_archiver/utils/url.py
index 2bb19cf..a44a91d 100644
--- a/src/auto_archiver/utils/url.py
+++ b/src/auto_archiver/utils/url.py
@@ -1,5 +1,5 @@
 import re
-from urllib.parse import urlparse, urlunparse
+from urllib.parse import parse_qsl, urlencode, urlparse, urlunparse
 from ipaddress import ip_address
 
 
@@ -53,7 +53,11 @@ def domain_for_url(url: str) -> str:
 
 
 def clean(url: str) -> str:
-    return url
+    TRACKERS = {"utm_source", "utm_medium", "utm_campaign", "utm_term", "utm_content", "fbclid", "gclid"}
+
+    parsed = urlparse(url)
+    clean_qs = [(k, v) for k, v in parse_qsl(parsed.query, keep_blank_values=True) if k not in TRACKERS]
+    return parsed._replace(query=urlencode(clean_qs)).geturl()
 
 
 def is_auth_wall(url: str) -> bool:
diff --git a/tests/utils/test_urls.py b/tests/utils/test_urls.py
index df8e0f3..2c77122 100644
--- a/tests/utils/test_urls.py
+++ b/tests/utils/test_urls.py
@@ -1,5 +1,6 @@
 import pytest
 from auto_archiver.utils.url import (
+    clean,
     is_auth_wall,
     check_url_or_raise,
     domain_for_url,
@@ -158,3 +159,39 @@ def test_twitter_best_quality_url(url, best_quality):
 )
 def test_get_media_url_best_quality(input_url, expected_url):
     assert get_media_url_best_quality(input_url) == expected_url
+
+
+@pytest.mark.parametrize(
+    "input_url,expected_url",
+    [
+        # No trackers present
+        ("https://example.com/page?foo=bar&baz=qux", "https://example.com/page?foo=bar&baz=qux"),
+        # Single tracker present
+        ("https://example.com/page?utm_source=google&foo=bar", "https://example.com/page?foo=bar"),
+        # Multiple trackers present
+        ("https://example.com/page?utm_source=google&utm_medium=email&utm_campaign=spring", "https://example.com/page"),
+        # Trackers mixed with other params
+        (
+            "https://example.com/page?foo=bar&utm_content=abc&baz=qux&gclid=123",
+            "https://example.com/page?foo=bar&baz=qux",
+        ),
+        # Only trackers present
+        ("https://example.com/page?utm_source=google&gclid=123", "https://example.com/page"),
+        # No query string
+        ("https://example.com/page", "https://example.com/page"),
+        # Trackers in fragment (should not be removed)
+        ("https://example.com/page#utm_source=google", "https://example.com/page#utm_source=google"),
+        # Tracker in query followed by a fragment (fragment is preserved)
+        ("https://example.com/page?utm_source=google#section-1", "https://example.com/page#section-1"),
+        # Trackers with empty value
+        ("https://example.com/page?utm_source=&foo=bar", "https://example.com/page?foo=bar"),
+        # Trackers with multiple values
+        ("https://example.com/page?utm_source=google&utm_source=bing&foo=bar", "https://example.com/page?foo=bar"),
+        # Trackers with encoded values
+        ("https://example.com/page?utm_source=google%20ads&foo=bar", "https://example.com/page?foo=bar"),
+        # Unrelated param with similar name
+        ("https://example.com/page?utm_sourc=keepme&foo=bar", "https://example.com/page?utm_sourc=keepme&foo=bar"),
+    ],
+)
+def test_clean_removes_trackers(input_url, expected_url):
+    assert clean(input_url) == expected_url