mirror of
https://github.com/bellingcat/auto-archiver.git
synced 2026-06-13 05:38:29 +03:00
new archiver, new hack, ready
This commit is contained in:
@@ -6,4 +6,5 @@ from .tiktok_archiver import TiktokArchiver
|
||||
from .wayback_archiver import WaybackArchiver
|
||||
from .youtubedl_archiver import YoutubeDLArchiver
|
||||
from .twitter_archiver import TwitterArchiver
|
||||
from .vk_archiver import VkArchiver
|
||||
from .vk_archiver import VkArchiver
|
||||
from .twitter_api_archiver import TwitterApiArchiver
|
||||
@@ -149,9 +149,13 @@ class Archiver(ABC):
|
||||
if a string is passed in @with_extension the slug is appended with it if there is no "." in the slug
|
||||
if @append_date is true, the key adds a timestamp after the URL slug and before the extension
|
||||
"""
|
||||
slug = slugify(urlparse(url).path)
|
||||
url_path = urlparse(url).path
|
||||
path, ext = os.path.splitext(url_path)
|
||||
slug = slugify(path)
|
||||
if append_datetime:
|
||||
slug += "-" + slugify(datetime.datetime.utcnow().isoformat())
|
||||
if len(ext):
|
||||
slug += ext
|
||||
if with_extension is not None:
|
||||
if "." not in slug:
|
||||
slug += with_extension
|
||||
|
||||
@@ -41,7 +41,7 @@ class TelethonArchiver(Archiver):
|
||||
|
||||
def download(self, url, check_if_exists=False):
|
||||
if not hasattr(self, "client"):
|
||||
logger.error('Missing Telethon config')
|
||||
logger.warning('Missing Telethon config')
|
||||
return False
|
||||
|
||||
# detect URLs that we definitely cannot handle
|
||||
@@ -80,7 +80,6 @@ class TelethonArchiver(Archiver):
|
||||
if check_if_exists and self.storage.exists(key):
|
||||
# only s3 storage supports storage.exists as not implemented on gd
|
||||
cdn_url = self.storage.get_cdn_url(key)
|
||||
status = 'already archived'
|
||||
return ArchiveResult(status='already archived', cdn_url=cdn_url, title=post.message, timestamp=post.date, screenshot=screenshot)
|
||||
|
||||
group_id = post.grouped_id if post.grouped_id is not None else post.id
|
||||
|
||||
73
archivers/twitter_api_archiver.py
Normal file
73
archivers/twitter_api_archiver.py
Normal file
@@ -0,0 +1,73 @@
|
||||
|
||||
import json
|
||||
from datetime import datetime
|
||||
from loguru import logger
|
||||
from pytwitter import Api
|
||||
|
||||
from storages.base_storage import Storage
|
||||
from configs import TwitterApiConfig
|
||||
from .base_archiver import ArchiveResult
|
||||
from .twitter_archiver import TwitterArchiver
|
||||
|
||||
|
||||
class TwitterApiArchiver(TwitterArchiver):
    """Archive tweets by querying the Twitter API through ``pytwitter``.

    Reuses the URL parsing, variant selection and syndication fallback of
    ``TwitterArchiver``; only the way tweet data is fetched differs.
    """
    name = "twitter_api"

    def __init__(self, storage: Storage, driver, config: TwitterApiConfig):
        """Create the API client when ``config`` carries usable credentials.

        ``self.api`` is deliberately left unset when neither a bearer token
        nor the full consumer/access key set is configured; ``download``
        checks for the attribute and declines to run without it.
        """
        super().__init__(storage, driver)

        if config.bearer_token:
            self.api = Api(bearer_token=config.bearer_token)
        elif config.consumer_key and config.consumer_secret and config.access_token and config.access_secret:
            self.api = Api(
                consumer_key=config.consumer_key, consumer_secret=config.consumer_secret, access_token=config.access_token, access_secret=config.access_secret)

    def download(self, url, check_if_exists=False):
        """Archive the tweet at ``url`` together with its media.

        Returns ``False`` when no API client is configured or ``url`` is not
        a recognised tweet link, otherwise an ``ArchiveResult``. When a media
        URL cannot be resolved, the result of ``download_alternative`` is
        returned instead.
        """
        if not hasattr(self, "api"):
            logger.warning('Missing Twitter API config')
            return False

        username, tweet_id = self.get_username_tweet_id(url)
        if not username: return False

        tweet = self.api.get_tweet(tweet_id, expansions=["attachments.media_keys"], media_fields=["type", "duration_ms", "url", "variants"], tweet_fields=["attachments", "author_id", "created_at", "entities", "id", "text", "possibly_sensitive"])
        timestamp = datetime.strptime(tweet.data.created_at, "%Y-%m-%dT%H:%M:%S.%fZ")

        # check if exists
        key = self.get_html_key(url)
        if check_if_exists and self.storage.exists(key):
            # only s3 storage supports storage.exists as not implemented on gd
            cdn_url = self.storage.get_cdn_url(key)
            screenshot = self.get_screenshot(url)
            return ArchiveResult(status='already archived', cdn_url=cdn_url, title=tweet.data.text, timestamp=timestamp, screenshot=screenshot)

        urls = []
        # `includes` may be present while `media` is None; treat that as "no media"
        media_items = (tweet.includes.media or []) if tweet.includes else []
        for m in media_items:
            if m.url:
                urls.append(m.url)
            elif getattr(m, "variants", None):
                # hasattr() alone is true for a None attribute, which choose_variant
                # cannot iterate, so require a non-empty value here.
                # NOTE(review): choose_variant reads "type"/"src" keys from each
                # variant - confirm the API's variant objects have that shape.
                urls.append(self.choose_variant(m.variants))
            else:
                urls.append(None)  # will trigger the fallback below

        if any(u is None for u in urls):
            logger.error(f"Should not have gotten None url for {tweet.includes.media=}")
            return self.download_alternative(url, tweet_id)
        logger.debug(f"found {urls=}")

        output = json.dumps({
            "id": tweet.data.id,
            "text": tweet.data.text,
            "created_at": tweet.data.created_at,
            "author_id": tweet.data.author_id,
            "geo": tweet.data.geo,
            "lang": tweet.data.lang,
            "media": urls
        }, ensure_ascii=False, indent=4)

        screenshot = self.get_screenshot(url)
        page_cdn, page_hash, thumbnail = self.generate_media_page(urls, url, output)
        return ArchiveResult(status="success", cdn_url=page_cdn, screenshot=screenshot, hash=page_hash, thumbnail=thumbnail, timestamp=timestamp, title=tweet.data.text)
|
||||
@@ -1,6 +1,5 @@
|
||||
|
||||
import html
|
||||
from urllib.parse import urlparse
|
||||
import html, re, requests
|
||||
from datetime import datetime
|
||||
from loguru import logger
|
||||
from snscrape.modules.twitter import TwitterTweetScraper, Video, Gif, Photo
|
||||
|
||||
@@ -9,20 +8,21 @@ from .base_archiver import Archiver, ArchiveResult
|
||||
|
||||
class TwitterArchiver(Archiver):
|
||||
name = "twitter"
|
||||
link_pattern = re.compile(r"twitter.com\/(?:\#!\/)?(\w+)\/status(?:es)?\/(\d+)")
|
||||
|
||||
def get_username_tweet_id(self, url):
    """Extract ``(username, tweet_id)`` from a tweet URL.

    Only the first tweet link found in ``url`` is considered. Returns
    ``(False, False)`` when ``url`` does not match ``self.link_pattern``,
    i.e. when this archiver definitely cannot handle it.
    """
    first_match = self.link_pattern.search(url)
    if first_match is None:
        return False, False

    # both capture groups are mandatory in the pattern, so this always unpacks to two strings
    username, tweet_id = first_match.groups()
    logger.debug(f"Found {username=} and {tweet_id=} in {url=}")
    return username, tweet_id
|
||||
|
||||
def download(self, url, check_if_exists=False):
|
||||
|
||||
if 'twitter.com' != self.get_netloc(url):
|
||||
logger.debug(f'{url=} is not from twitter')
|
||||
return False
|
||||
|
||||
tweet_id = urlparse(url).path.split('/')
|
||||
if 'status' in tweet_id:
|
||||
i = tweet_id.index('status')
|
||||
tweet_id = tweet_id[i + 1]
|
||||
else:
|
||||
logger.debug(f'{url=} does not contain "status"')
|
||||
return False
|
||||
username, tweet_id = self.get_username_tweet_id(url)
|
||||
if not username: return False
|
||||
|
||||
scr = TwitterTweetScraper(tweet_id)
|
||||
|
||||
@@ -30,7 +30,7 @@ class TwitterArchiver(Archiver):
|
||||
tweet = next(scr.get_items())
|
||||
except Exception as ex:
|
||||
logger.warning(f"can't get tweet: {type(ex).__name__} occurred. args: {ex.args}")
|
||||
return False
|
||||
return self.download_alternative(url, tweet_id)
|
||||
|
||||
if tweet.media is None:
|
||||
logger.debug(f'No media found, archiving tweet text only')
|
||||
@@ -57,3 +57,40 @@ class TwitterArchiver(Archiver):
|
||||
screenshot = self.get_screenshot(url)
|
||||
|
||||
return ArchiveResult(status="success", cdn_url=page_cdn, screenshot=screenshot, hash=page_hash, thumbnail=thumbnail, timestamp=tweet.date, title=tweet.content)
|
||||
|
||||
def download_alternative(self, url, tweet_id):
    """Archive a tweet through the public syndication endpoint.

    Used as a fallback when the primary way of fetching the tweet fails.

    Returns ``False`` when the request fails or does not answer with HTTP
    200, otherwise an ``ArchiveResult`` built from the returned JSON.
    Tweets without photos and/or without a video are supported: missing
    ``photos``/``video`` entries simply contribute no media URLs.
    """
    logger.debug(f"Trying twitter hack for {url=}")
    hack_url = f"https://cdn.syndication.twimg.com/tweet?id={tweet_id}"
    try:
        # without a timeout a stalled connection would block the archiver forever
        r = requests.get(hack_url, timeout=30)
    except requests.RequestException as ex:
        logger.warning(f"twitter hack request failed: {type(ex).__name__} occurred. args: {ex.args}")
        return False
    if r.status_code != 200: return False
    tweet = r.json()

    # "photos" is absent for tweets that carry no images
    urls = [p["url"] for p in (tweet.get("photos") or [])]

    # 1 tweet has 1 video max; "video" is absent for tweets without one
    v = tweet.get("video")
    if v:
        video_url = self.choose_variant(v.get("variants", []))
        if video_url:  # never hand a None entry to the media page generator
            urls.append(video_url)

    logger.debug(f"Twitter hack got {urls=}")

    timestamp = datetime.strptime(tweet["created_at"], "%Y-%m-%dT%H:%M:%S.%fZ")
    screenshot = self.get_screenshot(url)
    page_cdn, page_hash, thumbnail = self.generate_media_page(urls, url, r.text)
    return ArchiveResult(status="success", cdn_url=page_cdn, screenshot=screenshot, hash=page_hash, thumbnail=thumbnail, timestamp=timestamp, title=tweet["text"])
|
||||
|
||||
def choose_variant(self, variants):
    """Return the ``src`` URL of the highest-quality video variant.

    ``variants`` is an iterable of dicts with ``"type"`` and ``"src"``
    keys. MP4 variants whose URL contains a ``/<width>x<height>/`` segment
    are ranked by pixel count (width * height), so a variant that is
    larger in only one dimension cannot displace a higher-resolution one.
    Variants that cannot be ranked (other types, or no resolution in the
    URL) are used only as a fallback when no ranked variant exists; the
    first such variant wins.

    Returns ``None`` when ``variants`` is empty/``None`` or no variant has
    a ``src``.
    """
    best_src, best_pixels = None, 0
    fallback_src = None
    for var in variants or []:
        src = var.get("src")
        if not src:
            continue  # nothing downloadable in this entry

        resolution = None
        if var.get("type") == "video/mp4":
            resolution = re.search(r"\/(\d+)x(\d+)\/", src)

        if resolution:
            pixels = int(resolution[1]) * int(resolution[2])
            if pixels > best_pixels:
                best_src, best_pixels = src, pixels
        elif fallback_src is None:
            fallback_src = src

    return best_src if best_src is not None else fallback_src
|
||||
|
||||
Reference in New Issue
Block a user