mirror of
https://github.com/bellingcat/auto-archiver.git
synced 2026-06-12 05:08:28 +03:00
Ruff format with defaults.
This commit is contained in:
@@ -1 +1 @@
|
||||
from .telegram_extractor import TelegramExtractor
|
||||
from .telegram_extractor import TelegramExtractor
|
||||
|
||||
@@ -15,11 +15,11 @@ class TelegramExtractor(Extractor):
|
||||
def download(self, item: Metadata) -> Metadata:
|
||||
url = item.get_url()
|
||||
# detect URLs that we definitely cannot handle
|
||||
if 't.me' != item.netloc:
|
||||
if "t.me" != item.netloc:
|
||||
return False
|
||||
|
||||
headers = {
|
||||
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36'
|
||||
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36"
|
||||
}
|
||||
|
||||
# TODO: check if we can do this more resilient to variable URLs
|
||||
@@ -27,11 +27,11 @@ class TelegramExtractor(Extractor):
|
||||
url += "?embed=1"
|
||||
|
||||
t = requests.get(url, headers=headers)
|
||||
s = BeautifulSoup(t.content, 'html.parser')
|
||||
s = BeautifulSoup(t.content, "html.parser")
|
||||
|
||||
result = Metadata()
|
||||
result.set_content(html.escape(str(t.content)))
|
||||
if (timestamp := (s.find_all('time') or [{}])[0].get('datetime')):
|
||||
if timestamp := (s.find_all("time") or [{}])[0].get("datetime"):
|
||||
result.set_timestamp(timestamp)
|
||||
|
||||
video = s.find("video")
|
||||
@@ -41,25 +41,26 @@ class TelegramExtractor(Extractor):
|
||||
|
||||
image_urls = []
|
||||
for im in image_tags:
|
||||
urls = [u.replace("'", "") for u in re.findall(r'url\((.*?)\)', im['style'])]
|
||||
urls = [u.replace("'", "") for u in re.findall(r"url\((.*?)\)", im["style"])]
|
||||
image_urls += urls
|
||||
|
||||
if not len(image_urls): return False
|
||||
if not len(image_urls):
|
||||
return False
|
||||
for img_url in image_urls:
|
||||
result.add_media(Media(self.download_from_url(img_url)))
|
||||
else:
|
||||
video_url = video.get('src')
|
||||
video_url = video.get("src")
|
||||
m_video = Media(self.download_from_url(video_url))
|
||||
# extract duration from HTML
|
||||
try:
|
||||
duration = s.find_all('time')[0].contents[0]
|
||||
if ':' in duration:
|
||||
duration = float(duration.split(
|
||||
':')[0]) * 60 + float(duration.split(':')[1])
|
||||
duration = s.find_all("time")[0].contents[0]
|
||||
if ":" in duration:
|
||||
duration = float(duration.split(":")[0]) * 60 + float(duration.split(":")[1])
|
||||
else:
|
||||
duration = float(duration)
|
||||
m_video.set("duration", duration)
|
||||
except: pass
|
||||
except:
|
||||
pass
|
||||
result.add_media(m_video)
|
||||
|
||||
return result.success("telegram")
|
||||
|
||||
Reference in New Issue
Block a user