mirror of
https://github.com/bellingcat/auto-archiver.git
synced 2026-06-12 13:18:28 +03:00
* Removes (partly) the ArchivingOrchestrator * Removes the cli_feeder module, and makes it the 'default', allowing you to pass URLs directly on the command line, without having to use the cumbersome --cli_feeder.urls. Just do auto-archiver https://my.url.com * More unit tests * Improved error handling
43 lines
1.3 KiB
Python
43 lines
1.3 KiB
Python
import requests
|
|
from loguru import logger
|
|
|
|
from auto_archiver.core import Feeder
|
|
from auto_archiver.core import Metadata
|
|
|
|
|
|
class AtlosFeeder(Feeder):
    """Feeder that yields not-yet-archived source material from an Atlos instance.

    Reads ``self.atlos_url`` and ``self.api_token``; neither is assigned in this
    class (presumably they are injected from the module configuration -- confirm
    against the loader).
    """

    def __iter__(self) -> Metadata:
        """Page through the Atlos source-material API and yield items to archive.

        This is a generator: it requests ``/api/v2/source_material`` repeatedly,
        following the ``next`` cursor returned by each page, until a page has no
        results or no further cursor is provided.

        Yields:
            Metadata: one object per eligible item, with its URL set to the
            item's ``source_url`` and the item's ``id`` stored under the
            ``atlos_id`` key.

        Raises:
            requests.HTTPError: if the API answers with an error status code.
        """
        cursor = None
        while True:
            response = requests.get(
                f"{self.atlos_url}/api/v2/source_material",
                headers={"Authorization": f"Bearer {self.api_token}"},
                params={"cursor": cursor},
            )
            # Check the status before decoding: an error response may not carry
            # a JSON body, and decoding it first would hide the HTTP failure
            # behind a JSON parsing error.
            response.raise_for_status()
            data = response.json()
            cursor = data["next"]
            results = data["results"]

            for item in results:
                # Skip items that have no URL to archive.
                if item["source_url"] in [None, ""]:
                    continue
                # Skip items already flagged as processed. The flag is compared
                # against True (not mere truthiness), so any other stored value
                # is still treated as unprocessed.
                processed = (
                    item["metadata"].get("auto_archiver", {}).get("processed", False)
                )
                if processed == True:  # noqa: E712 - deliberate equality check
                    continue
                # Only visible items are fed to the archiver.
                if item["visibility"] != "visible":
                    continue
                # Skip items whose status is still in flight.
                if item["status"] in ["processing", "pending"]:
                    continue

                yield Metadata().set_url(item["source_url"]).set(
                    "atlos_id", item["id"]
                )

            # Stop on an empty page or when the API gives no further cursor.
            if not results or cursor is None:
                break
|