mirror of
https://github.com/bellingcat/auto-archiver.git
synced 2026-06-12 21:28:29 +03:00
Compare commits
5 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
4d80ee6f02 | ||
|
|
92569ae6be | ||
|
|
abaf86c776 | ||
|
|
8005a1955a | ||
|
|
b7889a182d |
2
Pipfile
2
Pipfile
@@ -30,7 +30,7 @@ tqdm = "*"
|
|||||||
jinja2 = "*"
|
jinja2 = "*"
|
||||||
cryptography = "*"
|
cryptography = "*"
|
||||||
dataclasses-json = "*"
|
dataclasses-json = "*"
|
||||||
yt-dlp = ">=2023.2.17"
|
yt-dlp = "*"
|
||||||
vk-url-scraper = "*"
|
vk-url-scraper = "*"
|
||||||
uwsgi = "*"
|
uwsgi = "*"
|
||||||
requests = {extras = ["socks"], version = "*"}
|
requests = {extras = ["socks"], version = "*"}
|
||||||
|
|||||||
2
Pipfile.lock
generated
2
Pipfile.lock
generated
@@ -1,7 +1,7 @@
|
|||||||
{
|
{
|
||||||
"_meta": {
|
"_meta": {
|
||||||
"hash": {
|
"hash": {
|
||||||
"sha256": "6e76638769e56f28c2cc56e548d3ac1752b36db2160e23a865089c80e584dcba"
|
"sha256": "84ebe4378c02b26d0663f6d7ede49064ec7428dddca668c8d8a5d64cf9191f09"
|
||||||
},
|
},
|
||||||
"pipfile-spec": 6,
|
"pipfile-spec": 6,
|
||||||
"requires": {
|
"requires": {
|
||||||
|
|||||||
@@ -197,7 +197,8 @@ Outputs:
|
|||||||
* **Title**: Post title
|
* **Title**: Post title
|
||||||
* **Text**: Post text
|
* **Text**: Post text
|
||||||
* **Screenshot**: Link to screenshot of post
|
* **Screenshot**: Link to screenshot of post
|
||||||
* **Hash**: Hash of archived HTML file (which contains hashes of post media)
|
* **Hash**: Hash of archived HTML file (which contains hashes of post media) - for checksums/verification
|
||||||
|
* **Perceptual Hash**: Perceptual hashes of found images - these can be used for de-duplication of content
|
||||||
* **WACZ**: Link to a WACZ web archive of post
|
* **WACZ**: Link to a WACZ web archive of post
|
||||||
* **ReplayWebpage**: Link to a ReplayWebpage viewer of the WACZ archive
|
* **ReplayWebpage**: Link to a ReplayWebpage viewer of the WACZ archive
|
||||||
|
|
||||||
|
|||||||
@@ -48,7 +48,8 @@ class TelegramArchiver(Archiver):
|
|||||||
video = s.find("video")
|
video = s.find("video")
|
||||||
if video is None:
|
if video is None:
|
||||||
logger.warning("could not find video")
|
logger.warning("could not find video")
|
||||||
image_tags = s.find_all(class_="js-message_photo")
|
image_tags = s.find_all(class_="tgme_widget_message_photo_wrap")
|
||||||
|
logger.info(image_tags)
|
||||||
|
|
||||||
image_urls = []
|
image_urls = []
|
||||||
for im in image_tags:
|
for im in image_tags:
|
||||||
|
|||||||
@@ -90,20 +90,36 @@ class TwitterArchiver(Archiver):
|
|||||||
|
|
||||||
def download_alternative(self, item: Metadata, url: str, tweet_id: str) -> Metadata:
|
def download_alternative(self, item: Metadata, url: str, tweet_id: str) -> Metadata:
|
||||||
"""
|
"""
|
||||||
CURRENTLY STOPPED WORKING
|
Hack alternative working again.
|
||||||
|
https://stackoverflow.com/a/71867055/6196010 (OUTDATED URL)
|
||||||
|
https://github.com/JustAnotherArchivist/snscrape/issues/996#issuecomment-1615937362
|
||||||
|
next to test: https://cdn.embedly.com/widgets/media.html?&schema=twitter&url=https://twitter.com/bellingcat/status/1674700676612386816
|
||||||
"""
|
"""
|
||||||
return False
|
headers = {
|
||||||
# https://stackoverflow.com/a/71867055/6196010
|
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/114.0",
|
||||||
|
"Accept": "*/*",
|
||||||
|
"Accept-Language": "en-US,en;q=0.5",
|
||||||
|
"Accept-Encoding": "gzip, deflate, br",
|
||||||
|
"Origin": "https://platform.twitter.com",
|
||||||
|
"Connection": "keep-alive",
|
||||||
|
"Referer": "https://platform.twitter.com/",
|
||||||
|
"Sec-Fetch-Dest": "empty",
|
||||||
|
"Sec-Fetch-Mode": "cors",
|
||||||
|
"Sec-Fetch-Site": "cross-site",
|
||||||
|
"Pragma": "no-cache",
|
||||||
|
"Cache-Control": "no-cache",
|
||||||
|
"TE": "trailers"
|
||||||
|
}
|
||||||
logger.debug(f"Trying twitter hack for {url=}")
|
logger.debug(f"Trying twitter hack for {url=}")
|
||||||
result = Metadata()
|
result = Metadata()
|
||||||
|
|
||||||
hack_url = f"https://cdn.syndication.twimg.com/tweet?id={tweet_id}"
|
hack_url = f"https://cdn.syndication.twimg.com/tweet-result?id={tweet_id}"
|
||||||
r = requests.get(hack_url)
|
r = requests.get(hack_url)
|
||||||
if r.status_code != 200: return False
|
if r.status_code != 200: return False
|
||||||
tweet = r.json()
|
tweet = r.json()
|
||||||
|
|
||||||
urls = []
|
urls = []
|
||||||
for p in tweet["photos"]:
|
for p in tweet.get("photos", []):
|
||||||
urls.append(p["url"])
|
urls.append(p["url"])
|
||||||
|
|
||||||
# 1 tweet has 1 video max
|
# 1 tweet has 1 video max
|
||||||
@@ -113,14 +129,18 @@ class TwitterArchiver(Archiver):
|
|||||||
|
|
||||||
logger.debug(f"Twitter hack got {urls=}")
|
logger.debug(f"Twitter hack got {urls=}")
|
||||||
|
|
||||||
for u in urls:
|
for i, u in enumerate(urls):
|
||||||
media = Media()
|
media = Media(filename="")
|
||||||
media.set("src", u)
|
media.set("src", u)
|
||||||
media.filename = self.download_from_url(u, f'{slugify(url)}_{i}', item)
|
ext = ""
|
||||||
|
if (mtype := mimetypes.guess_type(u)[0]):
|
||||||
|
ext = mimetypes.guess_extension(mtype)
|
||||||
|
|
||||||
|
media.filename = self.download_from_url(u, f'{slugify(url)}_{i}{ext}', item)
|
||||||
result.add_media(media)
|
result.add_media(media)
|
||||||
|
|
||||||
result.set_content(json.dumps(tweet, ensure_ascii=False)).set_timestamp(datetime.strptime(tweet["created_at"], "%Y-%m-%dT%H:%M:%S.%fZ"))
|
result.set_title(tweet.get("text")).set_content(json.dumps(tweet, ensure_ascii=False)).set_timestamp(datetime.strptime(tweet["created_at"], "%Y-%m-%dT%H:%M:%S.%fZ"))
|
||||||
return result
|
return result.success("twitter-hack")
|
||||||
|
|
||||||
def get_username_tweet_id(self, url):
|
def get_username_tweet_id(self, url):
|
||||||
# detect URLs that we definitely cannot handle
|
# detect URLs that we definitely cannot handle
|
||||||
|
|||||||
@@ -67,7 +67,7 @@ class GsheetsDb(Database):
|
|||||||
batch_if_valid('title', item.get_title())
|
batch_if_valid('title', item.get_title())
|
||||||
batch_if_valid('text', item.get("content", ""))
|
batch_if_valid('text', item.get("content", ""))
|
||||||
batch_if_valid('timestamp', item.get_timestamp())
|
batch_if_valid('timestamp', item.get_timestamp())
|
||||||
batch_if_valid('hash', media.get("hash", "not-calculated"))
|
if media: batch_if_valid('hash', media.get("hash", "not-calculated"))
|
||||||
|
|
||||||
# merge all pdq hashes into a single string, if present
|
# merge all pdq hashes into a single string, if present
|
||||||
pdq_hashes = []
|
pdq_hashes = []
|
||||||
|
|||||||
@@ -3,7 +3,7 @@ _MAJOR = "0"
|
|||||||
_MINOR = "5"
|
_MINOR = "5"
|
||||||
# On main and in a nightly release the patch should be one ahead of the last
|
# On main and in a nightly release the patch should be one ahead of the last
|
||||||
# released build.
|
# released build.
|
||||||
_PATCH = "25"
|
_PATCH = "27"
|
||||||
# This is mainly for nightly builds which have the suffix ".dev$DATE". See
|
# This is mainly for nightly builds which have the suffix ".dev$DATE". See
|
||||||
# https://semver.org/#is-v123-a-semantic-version for the semantics.
|
# https://semver.org/#is-v123-a-semantic-version for the semantics.
|
||||||
_SUFFIX = ""
|
_SUFFIX = ""
|
||||||
|
|||||||
Reference in New Issue
Block a user