mirror of
https://github.com/bellingcat/auto-archiver.git
synced 2026-06-13 05:38:29 +03:00
53 lines
1.7 KiB
Python
53 lines
1.7 KiB
Python
from loguru import logger
|
|
import requests
|
|
|
|
from auto_archiver.base_processors import Feeder
|
|
from auto_archiver.core import Metadata, ArchivingContext
|
|
from auto_archiver.utils import get_atlos_config_options
|
|
|
|
|
|
class AtlosFeeder(Feeder):
    """Feeder that yields the source-material URLs listed by the Atlos API.

    The instance reads ``self.api_token`` and ``self.atlos_url``; both are
    presumably populated from ``config`` by ``Feeder.__init__`` (not visible
    in this file -- TODO confirm).
    """

    name = "atlos_feeder"

    def __init__(self, config: dict) -> None:
        """Initialise the feeder and validate that an API token is present.

        Args:
            config: Configuration dictionary forwarded to ``Feeder.__init__``.

        Raises:
            Exception: If ``self.api_token`` is not a string after the base
                class initialisation.
        """
        # without this STEP.__init__ is not called
        super().__init__(config)
        if not isinstance(self.api_token, str):
            raise Exception("Atlos Feeder did not receive an Atlos API token")

    @staticmethod
    def _needs_archiving(item: dict) -> bool:
        """Return True when an Atlos source-material item should be yielded.

        An item qualifies when it has a non-empty ``source_url``, its
        ``metadata.auto_archiver.processed`` flag is not ``True``, it is
        ``visible`` and its status is neither ``processing`` nor ``pending``.
        The checks short-circuit in that order, so ``metadata`` is only read
        for items that carry a URL.
        """
        return (
            item["source_url"] not in (None, "")
            # Deliberately `!= True` rather than `not ...`: only an explicit
            # true flag marks the item as already processed.
            and item["metadata"].get("auto_archiver", {}).get("processed", False)
            != True  # noqa: E712
            and item["visibility"] == "visible"
            and item["status"] not in ("processing", "pending")
        )

    def __iter__(self) -> Metadata:
        """Yield a ``Metadata`` for every Atlos item that still needs archiving.

        Pages through ``{atlos_url}/api/v2/source_material``, following the
        ``next`` cursor returned by each page, until a page has no results or
        no further cursor. Each yielded ``Metadata`` has its URL set to the
        item's ``source_url`` and carries the item's id under ``atlos_id``.

        Yields:
            Metadata: One entry per qualifying source-material item.

        Raises:
            requests.HTTPError: If the API responds with an error status.
        """
        # Get all the urls from the Atlos API
        count = 0
        cursor = None
        while True:
            # NOTE(review): no timeout is passed, so a stalled connection
            # blocks indefinitely; consider adding one.
            response = requests.get(
                f"{self.atlos_url}/api/v2/source_material",
                headers={"Authorization": f"Bearer {self.api_token}"},
                params={"cursor": cursor},
            )
            # Check the HTTP status before decoding the body: an error
            # response is not guaranteed to contain JSON, and decoding first
            # would hide the real HTTP failure behind a JSON decode error.
            response.raise_for_status()
            data = response.json()
            cursor = data["next"]
            results = data["results"]

            for item in results:
                if self._needs_archiving(item):
                    yield Metadata().set_url(item["source_url"]).set(
                        "atlos_id", item["id"]
                    )
                    count += 1

            # Stop on an empty page or when the API reports no further page.
            if not results or cursor is None:
                break

        # Only reached once the consumer has exhausted the generator.
        logger.success(f"Processed {count} URL(s)")
|