Adds tracker-removal feature and tests

This commit is contained in:
msramalho
2025-06-11 11:56:42 +01:00
parent 69ddb72146
commit 3cf51dd874
3 changed files with 45 additions and 4 deletions

View File

@@ -34,7 +34,7 @@ from .config import (
from .module import ModuleFactory, LazyBaseModule
from . import validators, Feeder, Extractor, Database, Storage, Formatter, Enricher
from .consts import MODULE_TYPES, SetupError
from auto_archiver.utils.url import check_url_or_raise
from auto_archiver.utils.url import check_url_or_raise, clean
if TYPE_CHECKING:
from .base_module import BaseModule
@@ -572,7 +572,7 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_
raise e
# 1 - sanitize - each archiver is responsible for cleaning/expanding its own URLs
url = original_url
url = clean(original_url)
for a in self.extractors:
url = a.sanitize_url(url)

View File

@@ -1,5 +1,5 @@
import re
from urllib.parse import urlparse, urlunparse
from urllib.parse import parse_qsl, urlencode, urlparse, urlunparse
from ipaddress import ip_address
@@ -53,7 +53,11 @@ def domain_for_url(url: str) -> str:
def clean(url: str) -> str:
    """Return *url* with well-known tracking query parameters removed.

    The query string is parsed, every parameter whose name is in
    ``TRACKERS`` is dropped, and the URL is reassembled with the remaining
    parameters in their original order. Scheme, host, path and fragment
    are left as parsed.

    Args:
        url: The URL to clean.

    Returns:
        The URL without tracking parameters. If the URL carries no
        tracking parameter it is returned unchanged.
    """
    TRACKERS = {"utm_source", "utm_medium", "utm_campaign", "utm_term", "utm_content", "fbclid", "gclid"}
    parsed = urlparse(url)
    # keep_blank_values=True: by default parse_qsl silently discards
    # parameters with an empty value ("?a=&b=1"), which would strip
    # non-tracker parameters as well.
    pairs = parse_qsl(parsed.query, keep_blank_values=True)
    clean_qs = [(k, v) for k, v in pairs if k not in TRACKERS]
    if len(clean_qs) == len(pairs):
        # Nothing to strip: hand back the input untouched so that the
        # parse/encode round trip cannot re-encode it (e.g. "%20" -> "+").
        return url
    return parsed._replace(query=urlencode(clean_qs)).geturl()
def is_auth_wall(url: str) -> bool: