From f3ce22666562bed2780181dbef95b8dee5a5e69e Mon Sep 17 00:00:00 2001
From: msramalho <19508417+msramalho@users.noreply.github.com>
Date: Mon, 21 Feb 2022 14:19:09 +0100
Subject: [PATCH] split into multiple files MVP
---
.gitignore | 3 +-
Pipfile | 1 -
Pipfile.lock | 145 +-----------
README.md | 2 +
archivers.py | 390 --------------------------------
archivers/__init__.py | 6 +
archivers/base_archiver.py | 115 ++++++++++
archivers/telegram_archiver.py | 76 +++++++
archivers/tiktok_archiver.py | 68 ++++++
archivers/wayback_archiver.py | 73 ++++++
archivers/youtubedl_archiver.py | 88 +++++++
auto_archive.py | 15 +-
12 files changed, 446 insertions(+), 536 deletions(-)
delete mode 100644 archivers.py
create mode 100644 archivers/__init__.py
create mode 100644 archivers/base_archiver.py
create mode 100644 archivers/telegram_archiver.py
create mode 100644 archivers/tiktok_archiver.py
create mode 100644 archivers/wayback_archiver.py
create mode 100644 archivers/youtubedl_archiver.py
diff --git a/.gitignore b/.gitignore
index b6a6b68..5d7eec9 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,7 +1,8 @@
tmp/
-.env
+.env*
.DS_Store
expmt/
service_account.json
__pycache__/
._*
+anu.html
\ No newline at end of file
diff --git a/Pipfile b/Pipfile
index 0d954c9..27071fa 100644
--- a/Pipfile
+++ b/Pipfile
@@ -10,7 +10,6 @@ python-dotenv = "*"
youtube_dl = "*"
argparse = "*"
beautifulsoup4 = "*"
-nordvpn-switcher = "*"
tiktok-downloader = {git = "https://github.com/msramalho/tiktok-downloader"}
bs4 = "*"
loguru = "*"
diff --git a/Pipfile.lock b/Pipfile.lock
index b354d59..9879884 100644
--- a/Pipfile.lock
+++ b/Pipfile.lock
@@ -1,7 +1,7 @@
{
"_meta": {
"hash": {
- "sha256": "af39efbad8c78641a732697001193b5f4f92a0af8a9709081428001362a47060"
+ "sha256": "9a5218275503e5ae779407349d0a76f44712dc4824e066b10aeb047264a168be"
},
"pipfile-spec": 6,
"requires": {
@@ -93,6 +93,14 @@
],
"version": "==1.2.58"
},
+ "faker": {
+ "hashes": [
+ "sha256:ee8d9181137cdd2b198bd3d0653b0a3b7b385213862348e15ba8a423324b702b",
+ "sha256:f545b2a1ba5f7effc4ed71af0a5204d939445f0190838d41bee6bc160958bfbe"
+ ],
+ "markers": "python_version >= '3.6'",
+ "version": "==13.0.0"
+ },
"ffmpeg-python": {
"hashes": [
"sha256:65225db34627c578ef0e11c8b1eb528bb35e024752f6f10b78c011f6f64c4127",
@@ -180,73 +188,6 @@
"index": "pypi",
"version": "==0.6.0"
},
- "lxml": {
- "hashes": [
- "sha256:078306d19a33920004addeb5f4630781aaeabb6a8d01398045fcde085091a169",
- "sha256:0c1978ff1fd81ed9dcbba4f91cf09faf1f8082c9d72eb122e92294716c605428",
- "sha256:1010042bfcac2b2dc6098260a2ed022968dbdfaf285fc65a3acf8e4eb1ffd1bc",
- "sha256:1d650812b52d98679ed6c6b3b55cbb8fe5a5460a0aef29aeb08dc0b44577df85",
- "sha256:20b8a746a026017acf07da39fdb10aa80ad9877046c9182442bf80c84a1c4696",
- "sha256:2403a6d6fb61c285969b71f4a3527873fe93fd0abe0832d858a17fe68c8fa507",
- "sha256:24f5c5ae618395ed871b3d8ebfcbb36e3f1091fd847bf54c4de623f9107942f3",
- "sha256:28d1af847786f68bec57961f31221125c29d6f52d9187c01cd34dc14e2b29430",
- "sha256:31499847fc5f73ee17dbe1b8e24c6dafc4e8d5b48803d17d22988976b0171f03",
- "sha256:31ba2cbc64516dcdd6c24418daa7abff989ddf3ba6d3ea6f6ce6f2ed6e754ec9",
- "sha256:330bff92c26d4aee79c5bc4d9967858bdbe73fdbdbacb5daf623a03a914fe05b",
- "sha256:5045ee1ccd45a89c4daec1160217d363fcd23811e26734688007c26f28c9e9e7",
- "sha256:52cbf2ff155b19dc4d4100f7442f6a697938bf4493f8d3b0c51d45568d5666b5",
- "sha256:530f278849031b0eb12f46cca0e5db01cfe5177ab13bd6878c6e739319bae654",
- "sha256:545bd39c9481f2e3f2727c78c169425efbfb3fbba6e7db4f46a80ebb249819ca",
- "sha256:5804e04feb4e61babf3911c2a974a5b86f66ee227cc5006230b00ac6d285b3a9",
- "sha256:5a58d0b12f5053e270510bf12f753a76aaf3d74c453c00942ed7d2c804ca845c",
- "sha256:5f148b0c6133fb928503cfcdfdba395010f997aa44bcf6474fcdd0c5398d9b63",
- "sha256:5f7d7d9afc7b293147e2d506a4596641d60181a35279ef3aa5778d0d9d9123fe",
- "sha256:60d2f60bd5a2a979df28ab309352cdcf8181bda0cca4529769a945f09aba06f9",
- "sha256:6259b511b0f2527e6d55ad87acc1c07b3cbffc3d5e050d7e7bcfa151b8202df9",
- "sha256:6268e27873a3d191849204d00d03f65c0e343b3bcb518a6eaae05677c95621d1",
- "sha256:627e79894770783c129cc5e89b947e52aa26e8e0557c7e205368a809da4b7939",
- "sha256:62f93eac69ec0f4be98d1b96f4d6b964855b8255c345c17ff12c20b93f247b68",
- "sha256:6d6483b1229470e1d8835e52e0ff3c6973b9b97b24cd1c116dca90b57a2cc613",
- "sha256:6f7b82934c08e28a2d537d870293236b1000d94d0b4583825ab9649aef7ddf63",
- "sha256:6fe4ef4402df0250b75ba876c3795510d782def5c1e63890bde02d622570d39e",
- "sha256:719544565c2937c21a6f76d520e6e52b726d132815adb3447ccffbe9f44203c4",
- "sha256:730766072fd5dcb219dd2b95c4c49752a54f00157f322bc6d71f7d2a31fecd79",
- "sha256:74eb65ec61e3c7c019d7169387d1b6ffcfea1b9ec5894d116a9a903636e4a0b1",
- "sha256:7993232bd4044392c47779a3c7e8889fea6883be46281d45a81451acfd704d7e",
- "sha256:80bbaddf2baab7e6de4bc47405e34948e694a9efe0861c61cdc23aa774fcb141",
- "sha256:86545e351e879d0b72b620db6a3b96346921fa87b3d366d6c074e5a9a0b8dadb",
- "sha256:891dc8f522d7059ff0024cd3ae79fd224752676447f9c678f2a5c14b84d9a939",
- "sha256:8a31f24e2a0b6317f33aafbb2f0895c0bce772980ae60c2c640d82caac49628a",
- "sha256:8b99ec73073b37f9ebe8caf399001848fced9c08064effdbfc4da2b5a8d07b93",
- "sha256:986b7a96228c9b4942ec420eff37556c5777bfba6758edcb95421e4a614b57f9",
- "sha256:a1547ff4b8a833511eeaceacbcd17b043214fcdb385148f9c1bc5556ca9623e2",
- "sha256:a2bfc7e2a0601b475477c954bf167dee6d0f55cb167e3f3e7cefad906e7759f6",
- "sha256:a3c5f1a719aa11866ffc530d54ad965063a8cbbecae6515acbd5f0fae8f48eaa",
- "sha256:a9f1c3489736ff8e1c7652e9dc39f80cff820f23624f23d9eab6e122ac99b150",
- "sha256:aa0cf4922da7a3c905d000b35065df6184c0dc1d866dd3b86fd961905bbad2ea",
- "sha256:ad4332a532e2d5acb231a2e5d33f943750091ee435daffca3fec0a53224e7e33",
- "sha256:b2582b238e1658c4061ebe1b4df53c435190d22457642377fd0cb30685cdfb76",
- "sha256:b6fc2e2fb6f532cf48b5fed57567ef286addcef38c28874458a41b7837a57807",
- "sha256:b92d40121dcbd74831b690a75533da703750f7041b4bf951befc657c37e5695a",
- "sha256:bbab6faf6568484707acc052f4dfc3802bdb0cafe079383fbaa23f1cdae9ecd4",
- "sha256:c0b88ed1ae66777a798dc54f627e32d3b81c8009967c63993c450ee4cbcbec15",
- "sha256:ce13d6291a5f47c1c8dbd375baa78551053bc6b5e5c0e9bb8e39c0a8359fd52f",
- "sha256:db3535733f59e5605a88a706824dfcb9bd06725e709ecb017e165fc1d6e7d429",
- "sha256:dd10383f1d6b7edf247d0960a3db274c07e96cf3a3fc7c41c8448f93eac3fb1c",
- "sha256:e01f9531ba5420838c801c21c1b0f45dbc9607cb22ea2cf132844453bec863a5",
- "sha256:e11527dc23d5ef44d76fef11213215c34f36af1608074561fcc561d983aeb870",
- "sha256:e1ab2fac607842ac36864e358c42feb0960ae62c34aa4caaf12ada0a1fb5d99b",
- "sha256:e1fd7d2fe11f1cb63d3336d147c852f6d07de0d0020d704c6031b46a30b02ca8",
- "sha256:e9f84ed9f4d50b74fbc77298ee5c870f67cb7e91dcdc1a6915cb1ff6a317476c",
- "sha256:ec4b4e75fc68da9dc0ed73dcdb431c25c57775383fec325d23a770a64e7ebc87",
- "sha256:f10ce66fcdeb3543df51d423ede7e238be98412232fca5daec3e54bcd16b8da0",
- "sha256:f63f62fc60e6228a4ca9abae28228f35e1bd3ce675013d1dfb828688d50c6e23",
- "sha256:fa56bb08b3dd8eac3a8c5b7d075c94e74f755fd9d8a04543ae8d37b1612dd170",
- "sha256:fa9b7c450be85bfc6cd39f6df8c5b8cbd76b5d6fc1f69efec80203f9894b885f"
- ],
- "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
- "version": "==4.8.0"
- },
"markupsafe": {
"hashes": [
"sha256:023af8c54fe63530545f70dd2a2a7eed18d07a9a77b94e8bf1e2ff7f252db9a3",
@@ -293,14 +234,6 @@
"markers": "python_version >= '3.7'",
"version": "==2.1.0"
},
- "nordvpn-switcher": {
- "hashes": [
- "sha256:764db054715d949af0f836da5e46c4053afe92282a0d4b2cfc6b8cfe8c3045de",
- "sha256:9788c2c3113d0d7b00894dae3ea19bed14f3b38d111d7223c126f001b1729a3b"
- ],
- "index": "pypi",
- "version": "==0.2.9"
- },
"oauthlib": {
"hashes": [
"sha256:23a8208d75b902797ea29fd31fa80a15ed9dc2c6c16fe73f5d346f83f6fa27a2",
@@ -309,59 +242,6 @@
"markers": "python_version >= '3.6'",
"version": "==3.2.0"
},
- "pathlib": {
- "hashes": [
- "sha256:6940718dfc3eff4258203ad5021090933e5c04707d5ca8cc9e73c94a7894ea9f"
- ],
- "version": "==1.0.1"
- },
- "psutil": {
- "hashes": [
- "sha256:072664401ae6e7c1bfb878c65d7282d4b4391f1bc9a56d5e03b5a490403271b5",
- "sha256:1070a9b287846a21a5d572d6dddd369517510b68710fca56b0e9e02fd24bed9a",
- "sha256:1d7b433519b9a38192dfda962dd8f44446668c009833e1429a52424624f408b4",
- "sha256:3151a58f0fbd8942ba94f7c31c7e6b310d2989f4da74fcbf28b934374e9bf841",
- "sha256:32acf55cb9a8cbfb29167cd005951df81b567099295291bcfd1027365b36591d",
- "sha256:3611e87eea393f779a35b192b46a164b1d01167c9d323dda9b1e527ea69d697d",
- "sha256:3d00a664e31921009a84367266b35ba0aac04a2a6cad09c550a89041034d19a0",
- "sha256:4e2fb92e3aeae3ec3b7b66c528981fd327fb93fd906a77215200404444ec1845",
- "sha256:539e429da49c5d27d5a58e3563886057f8fc3868a5547b4f1876d9c0f007bccf",
- "sha256:55ce319452e3d139e25d6c3f85a1acf12d1607ddedea5e35fb47a552c051161b",
- "sha256:58c7d923dc209225600aec73aa2c4ae8ea33b1ab31bc11ef8a5933b027476f07",
- "sha256:7336292a13a80eb93c21f36bde4328aa748a04b68c13d01dfddd67fc13fd0618",
- "sha256:742c34fff804f34f62659279ed5c5b723bb0195e9d7bd9907591de9f8f6558e2",
- "sha256:7641300de73e4909e5d148e90cc3142fb890079e1525a840cf0dfd39195239fd",
- "sha256:76cebf84aac1d6da5b63df11fe0d377b46b7b500d892284068bacccf12f20666",
- "sha256:7779be4025c540d1d65a2de3f30caeacc49ae7a2152108adeaf42c7534a115ce",
- "sha256:7d190ee2eaef7831163f254dc58f6d2e2a22e27382b936aab51c835fc080c3d3",
- "sha256:8293942e4ce0c5689821f65ce6522ce4786d02af57f13c0195b40e1edb1db61d",
- "sha256:869842dbd66bb80c3217158e629d6fceaecc3a3166d3d1faee515b05dd26ca25",
- "sha256:90a58b9fcae2dbfe4ba852b57bd4a1dded6b990a33d6428c7614b7d48eccb492",
- "sha256:9b51917c1af3fa35a3f2dabd7ba96a2a4f19df3dec911da73875e1edaf22a40b",
- "sha256:b2237f35c4bbae932ee98902a08050a27821f8f6dfa880a47195e5993af4702d",
- "sha256:c3400cae15bdb449d518545cbd5b649117de54e3596ded84aacabfbb3297ead2",
- "sha256:c51f1af02334e4b516ec221ee26b8fdf105032418ca5a5ab9737e8c87dafe203",
- "sha256:cb8d10461c1ceee0c25a64f2dd54872b70b89c26419e147a05a10b753ad36ec2",
- "sha256:d62a2796e08dd024b8179bd441cb714e0f81226c352c802fca0fd3f89eeacd94",
- "sha256:df2c8bd48fb83a8408c8390b143c6a6fa10cb1a674ca664954de193fdcab36a9",
- "sha256:e5c783d0b1ad6ca8a5d3e7b680468c9c926b804be83a3a8e95141b05c39c9f64",
- "sha256:e9805fed4f2a81de98ae5fe38b75a74c6e6ad2df8a5c479594c7629a1fe35f56",
- "sha256:ea42d747c5f71b5ccaa6897b216a7dadb9f52c72a0fe2b872ef7d3e1eacf3ba3",
- "sha256:ef216cc9feb60634bda2f341a9559ac594e2eeaadd0ba187a4c2eb5b5d40b91c",
- "sha256:ff0d41f8b3e9ebb6b6110057e40019a432e96aae2008951121ba4e56040b84f3"
- ],
- "markers": "python_version >= '2.6' and python_version not in '3.0, 3.1, 3.2, 3.3'",
- "version": "==5.9.0"
- },
- "py-mini-racer": {
- "hashes": [
- "sha256:346e73bb89a2024888244d487834be24a121089ceb0641dd0200cb96c4e24b57",
- "sha256:42896c24968481dd953eeeb11de331f6870917811961c9b26ba09071e07180e2",
- "sha256:97cab31bbf63ce462ba4cd6e978c572c916d8b15586156c7c5e0b2e42c10baab",
- "sha256:f71e36b643d947ba698c57cd9bd2232c83ca997b0802fc2f7f79582377040c11"
- ],
- "version": "==0.6.0"
- },
"pyasn1": {
"hashes": [
"sha256:014c0e9976956a08139dc0712ae195324a75e142284d5f87f1a87ee1b068a359",
@@ -422,13 +302,6 @@
"index": "pypi",
"version": "==0.19.2"
},
- "random-user-agent": {
- "hashes": [
- "sha256:535636a55fb63fe3d74fd0260d854c241d9f2946447026464e578e68eac17dac",
- "sha256:8f8ca26ec8cb1d24ad1758d8b8f700d154064d641dbe9a255cfec42960fbd012"
- ],
- "version": "==1.0.1"
- },
"requests": {
"hashes": [
"sha256:68d7c56fd5a8999887728ef304a6d12edc7be74f1cfa47714fc8b414525c9a61",
diff --git a/README.md b/README.md
index 2e40bcc..cec6e9a 100644
--- a/README.md
+++ b/README.md
@@ -8,6 +8,8 @@ If you are using `pipenv` (recommended), `pipenv install` is sufficient to insta
[A Google Service account is necessary for use with `gspread`.](https://gspread.readthedocs.io/en/latest/oauth2.html#for-bots-using-service-account) Credentials for this account should be stored in `service_account.json`, in the same directory as the script.
+[ffmpeg](https://www.ffmpeg.org/) must also be installed locally for this tool to work.
+
A `.env` file is required for saving content to a Digital Ocean space, and for archiving pages to the Internet Archive. This file should also be in the script directory, and should contain the following variables:
```
diff --git a/archivers.py b/archivers.py
deleted file mode 100644
index d8a72f6..0000000
--- a/archivers.py
+++ /dev/null
@@ -1,390 +0,0 @@
-from dataclasses import dataclass
-import youtube_dl
-from bs4 import BeautifulSoup
-import requests
-import tiktok_downloader
-from loguru import logger
-import os
-import datetime
-import ffmpeg
-from botocore.errorfactory import ClientError
-import time
-import traceback
-
-# TODO There should be a better way of generating keys, that adds the following info:
-# - name of sheet that it is being archived from
-# (this means we might archive the same media twice on different sheets, but that's OK I think)
-# - name of archiver/platform that the video comes from
-# This should make it easier to maintain and clean the archive later
-
-# TODO "check_if_exists" has lots of repeated code across the archivers. Can this be
-# cleaned up? Difficult is we don't know the filename until the archivers start working.
-
-
-def get_cdn_url(key):
- return 'https://{}.{}.cdn.digitaloceanspaces.com/{}'.format(
- os.getenv('DO_BUCKET'), os.getenv('DO_SPACES_REGION'), key)
-
-
-def do_s3_upload(s3_client, f, key):
- s3_client.upload_fileobj(f, Bucket=os.getenv(
- 'DO_BUCKET'), Key=key, ExtraArgs={'ACL': 'public-read'})
-
-
-def get_key(filename):
- key = filename.split('/')[1]
- if 'unknown_video' in key:
- key = key.replace('unknown_video', 'jpg')
- return key
-
-
-def get_thumbnails(filename, s3_client, duration=None):
- if not os.path.exists(filename.split('.')[0]):
- os.mkdir(filename.split('.')[0])
-
- fps = 0.5
- if duration is not None:
- duration = float(duration)
-
- if duration < 60:
- fps = 10.0 / duration
- elif duration < 120:
- fps = 20.0 / duration
- else:
- fps = 40.0 / duration
-
- stream = ffmpeg.input(filename)
- stream = ffmpeg.filter(stream, 'fps', fps=fps).filter('scale', 512, -1)
- stream.output(filename.split('.')[0] + '/out%d.jpg').run()
-
- thumbnails = os.listdir(filename.split('.')[0] + '/')
- cdn_urls = []
-
- for fname in thumbnails:
- if fname[-3:] == 'jpg':
- thumbnail_filename = filename.split('.')[0] + '/' + fname
- key = filename.split('/')[1].split('.')[0] + '/' + fname
-
- cdn_url = get_cdn_url(key)
-
- with open(thumbnail_filename, 'rb') as f:
- do_s3_upload(s3_client, f, key)
-
- cdn_urls.append(cdn_url)
- os.remove(thumbnail_filename)
-
- if len(cdn_urls) == 0:
- return ('None', 'None')
-
- key_thumb = cdn_urls[int(len(cdn_urls)*0.1)]
-
- index_page = f'''
{filename}
- '''
-
- for t in cdn_urls:
- index_page += f'
'
-
- index_page += f""
- index_fname = filename.split('.')[0] + '/index.html'
-
- with open(index_fname, 'w') as f:
- f.write(index_page)
-
- thumb_index = filename.split('/')[1].split('.')[0] + '/index.html'
-
- s3_client.upload_fileobj(open(index_fname, 'rb'), Bucket=os.getenv(
- 'DO_BUCKET'), Key=thumb_index, ExtraArgs={'ACL': 'public-read', 'ContentType': 'text/html'})
-
- thumb_index_cdn_url = get_cdn_url(thumb_index)
-
- return (key_thumb, thumb_index_cdn_url)
-
-
-@dataclass
-class ArchiveResult:
- status: str
- cdn_url: str = None
- thumbnail: str = None
- thumbnail_index: str = None
- duration: float = None
- title: str = None
- timestamp: datetime.datetime = None
-
-
-class Archiver:
- def __init__(self, s3_client):
- self.s3 = s3_client
-
- def download(self, url):
- pass
-
-
-class TelegramArchiver(Archiver):
- def download(self, url, check_if_exists=False):
- # detect URLs that we definitely cannot handle
- if 'http://t.me/' not in url and 'https://t.me/' not in url:
- return False
-
- headers = {
- 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36'}
- status = "success"
-
- original_url = url
-
- if url[-8:] != "?embed=1":
- url += "?embed=1"
-
- t = requests.get(url, headers=headers)
- s = BeautifulSoup(t.content, 'html.parser')
- video = s.find("video")
-
- if video is None:
- return False # could not find video
-
- video_url = video.get('src')
- key = video_url.split('/')[-1].split('?')[0]
- filename = 'tmp/' + key
-
- if check_if_exists:
- try:
- self.s3.head_object(Bucket=os.getenv('DO_BUCKET'), Key=key)
-
- # file exists
- cdn_url = get_cdn_url(key)
-
- status = 'already archived'
-
- except ClientError:
- pass
-
- v = requests.get(video_url, headers=headers)
-
- with open(filename, 'wb') as f:
- f.write(v.content)
-
- if status != 'already archived':
- cdn_url = get_cdn_url(key)
-
- with open(filename, 'rb') as f:
- do_s3_upload(self.s3, f, key)
-
- # extract duration from HTML
- duration = s.find_all('time')[0].contents[0]
- if ':' in duration:
- duration = float(duration.split(
- ':')[0])*60 + float(duration.split(':')[1])
- else:
- duration = float(duration)
-
- # process thumbnails
- key_thumb, thumb_index = get_thumbnails(
- filename, self.s3, duration=duration)
- os.remove(filename)
-
- return ArchiveResult(status=status, cdn_url=cdn_url, thumbnail=key_thumb, thumbnail_index=thumb_index,
- duration=duration, title=original_url, timestamp=s.find_all('time')[1].get('datetime'))
-
-
-class YoutubeDLArchiver(Archiver):
- def download(self, url, check_if_exists=False):
- ydl_opts = {'outtmpl': 'tmp/%(id)s.%(ext)s', 'quiet': False}
- if (url[0:21] == 'https://facebook.com/' or url[0:25] == 'https://wwww.facebook.com/') and os.getenv('FB_COOKIE'):
- logger.info('Using Facebook cookie')
- youtube_dl.utils.std_headers['cookie'] = os.getenv('FB_COOKIE')
-
- ydl = youtube_dl.YoutubeDL(ydl_opts)
- cdn_url = None
- status = 'success'
-
- try:
- info = ydl.extract_info(url, download=False)
- except youtube_dl.utils.DownloadError:
- # no video here
- return False
-
- if 'is_live' in info and info['is_live']:
- logger.warning("Live streaming media, not archiving now")
- return ArchiveResult(status="Streaming media")
-
- if check_if_exists:
- if 'entries' in info:
- if len(info['entries']) > 1:
- logger.warning(
- 'YoutubeDLArchiver cannot archive channels or pages with multiple videos')
- return False
-
- filename = ydl.prepare_filename(info['entries'][0])
- else:
- filename = ydl.prepare_filename(info)
-
- key = get_key(filename)
-
- try:
- self.s3.head_object(Bucket=os.getenv('DO_BUCKET'), Key=key)
-
- # file exists
- cdn_url = get_cdn_url(key)
-
- status = 'already archived'
-
- except ClientError:
- pass
-
- # sometimes this results in a different filename, so do this again
- info = ydl.extract_info(url, download=True)
-
- if 'entries' in info:
- if len(info['entries']) > 1:
- logger.warning(
- 'YoutubeDLArchiver cannot archive channels or pages with multiple videos')
- return False
- else:
- info = info['entries'][0]
-
- filename = ydl.prepare_filename(info)
-
- if not os.path.exists(filename):
- filename = filename.split('.')[0] + '.mkv'
-
- if status != 'already archived':
- key = get_key(filename)
- cdn_url = get_cdn_url(key)
-
- with open(filename, 'rb') as f:
- do_s3_upload(self.s3, f, key)
-
- # get duration
- duration = info['duration'] if 'duration' in info else None
-
- # get thumbnails
- key_thumb, thumb_index = get_thumbnails(
- filename, self.s3, duration=duration)
- os.remove(filename)
-
- return ArchiveResult(status=status, cdn_url=cdn_url, thumbnail=key_thumb, thumbnail_index=thumb_index, duration=duration,
- title=info['title'] if 'title' in info else None,
- timestamp=info['timestamp'] if 'timestamp' in info else datetime.datetime.strptime(info['upload_date'], '%Y%m%d').timestamp() if 'upload_date' in info else None)
-
-
-class WaybackArchiver(Archiver):
- def __init__(self, s3_client):
- self.s3 = s3_client
- self.seen_urls = {}
-
- def download(self, url, check_if_exists=False):
- if check_if_exists and url in self.seen_urls:
- return self.seen_urls[url]
-
- ia_headers = {
- "Accept": "application/json",
- "Authorization": "LOW " + os.getenv('INTERNET_ARCHIVE_S3_KEY') + ":" + os.getenv('INTERNET_ARCHIVE_S3_SECRET')
- }
-
- r = requests.post(
- 'https://web.archive.org/save/', headers=ia_headers, data={'url': url})
-
- if r.status_code != 200:
- return ArchiveResult(status="Internet archive failed")
-
- job_id = r.json()['job_id']
-
- status_r = requests.get(
- 'https://web.archive.org/save/status/' + job_id, headers=ia_headers)
-
- retries = 0
-
- # wait 90-120 seconds for the archive job to finish
- while (status_r.status_code != 200 or status_r.json()['status'] == 'pending') and retries < 30:
- time.sleep(3)
-
- try:
- status_r = requests.get(
- 'https://web.archive.org/save/status/' + job_id, headers=ia_headers)
- except:
- time.sleep(1)
-
- retries += 1
-
- if status_r.status_code != 200:
- return ArchiveResult(status="Internet archive failed")
-
- status_json = status_r.json()
-
- if status_json['status'] != 'success':
- return ArchiveResult(status='Internet Archive failed: ' + status_json['message'])
-
- archive_url = 'https://web.archive.org/web/' + \
- status_json['timestamp'] + '/' + status_json['original_url']
-
- try:
- r = requests.get(archive_url)
-
- parsed = BeautifulSoup(
- r.content, 'html.parser')
-
- title = parsed.find_all('title')[
- 0].text
- except:
- title = "Could not get title"
-
- result = ArchiveResult(
- status='Internet Archive fallback', cdn_url=archive_url, title=title)
- self.seen_urls[url] = result
- return result
-
-
-class TiktokArchiver(Archiver):
- def download(self, url, check_if_exists=False):
- if 'tiktok.com' not in url:
- return False
-
- status = 'success'
-
- try:
- info = tiktok_downloader.info_post(url)
- key = 'tiktok_' + str(info.id) + '.mp4'
- filename = 'tmp/' + key
-
- if check_if_exists:
- try:
- self.s3.head_object(Bucket=os.getenv('DO_BUCKET'), Key=key)
-
- # file exists
- cdn_url = get_cdn_url(key)
-
- status = 'already archived'
-
- except ClientError:
- pass
-
- if status != 'already archived':
- media = tiktok_downloader.snaptik(url).get_media()
- if len(media) > 0:
- media[0].download(filename)
- with open(filename, 'rb') as f:
- do_s3_upload(self.s3, f, key)
-
- cdn_url = get_cdn_url(key)
- else:
- status = 'could not download media'
-
- try:
- key_thumb, thumb_index = get_thumbnails(
- filename, self.s3, duration=info.duration)
- except:
- key_thumb = ''
- thumb_index = 'error creating thumbnails'
-
- os.remove(filename)
-
- return ArchiveResult(status=status, cdn_url=cdn_url, thumbnail=key_thumb,
- thumbnail_index=thumb_index, duration=info.duration, title=info.caption, timestamp=info.create.isoformat())
-
- except tiktok_downloader.Except.InvalidUrl:
- status = 'Invalid URL'
- return ArchiveResult(status=status)
-
- except:
- error = traceback.format_exc()
- status = 'Other Tiktok error: ' + str(error)
- return ArchiveResult(status=status)
diff --git a/archivers/__init__.py b/archivers/__init__.py
new file mode 100644
index 0000000..e6c4ba6
--- /dev/null
+++ b/archivers/__init__.py
@@ -0,0 +1,6 @@
+# we need to explicitly expose the available imports here
+from .base_archiver import *
+from .telegram_archiver import *
+from .tiktok_archiver import *
+from .wayback_archiver import *
+from .youtubedl_archiver import *
\ No newline at end of file
diff --git a/archivers/base_archiver.py b/archivers/base_archiver.py
new file mode 100644
index 0000000..3f9f4ac
--- /dev/null
+++ b/archivers/base_archiver.py
@@ -0,0 +1,127 @@
+import os
+import ffmpeg
+from dataclasses import dataclass
+import datetime
+from loguru import logger
+
+# TODO There should be a better way of generating keys, that adds the following info:
+# - name of sheet that it is being archived from
+# (this means we might archive the same media twice on different sheets, but that's OK I think)
+# - name of archiver/platform that the video comes from
+# This should make it easier to maintain and clean the archive later
+
+# TODO "check_if_exists" has lots of repeated code across the archivers. Can this be
+# cleaned up? The difficulty is that we don't know the filename until the archivers start working.
+
+
+@dataclass
+class ArchiveResult:
+    """Outcome of a single archive attempt; only `status` is mandatory."""
+    status: str
+    cdn_url: str = None
+    thumbnail: str = None
+    thumbnail_index: str = None
+    duration: float = None
+    title: str = None
+    timestamp: datetime.datetime = None
+
+
+class Archiver:
+    """Base class with the S3 client and the upload/thumbnail helpers shared by archivers."""
+    name = "default"
+
+    def __init__(self, s3_client):
+        self.s3 = s3_client
+
+    def __str__(self):
+        return self.__class__.__name__
+
+    def download(self, url, check_if_exists=False):
+        """Archive `url`; subclasses override this, the base version only logs an error."""
+        logger.error("method 'download' not implemented")
+
+    def get_cdn_url(self, key):
+        """Return the public CDN URL of `key`, built from DO_BUCKET and DO_SPACES_REGION."""
+        return 'https://{}.{}.cdn.digitaloceanspaces.com/{}'.format(
+            os.getenv('DO_BUCKET'), os.getenv('DO_SPACES_REGION'), key)
+
+    def do_s3_upload(self, f, key):
+        """Upload the open binary file object `f` to DO_BUCKET under `key`, publicly readable."""
+        self.s3.upload_fileobj(f, Bucket=os.getenv(
+            'DO_BUCKET'), Key=key, ExtraArgs={'ACL': 'public-read'})
+
+    def get_key(self, filename):
+        """Return the second '/'-separated part of `filename` ('tmp/<name>' -> '<name>')."""
+        logger.debug(f"key base implementation: {self.name}")
+        # TODO: refactor to be more manageable
+        key = filename.split('/')[1]
+        if 'unknown_video' in key:
+            key = key.replace('unknown_video', 'jpg')
+        return key
+
+    def get_thumbnails(self, filename, duration=None):
+        """Extract frames from `filename` with ffmpeg and upload them with an HTML index page.
+
+        Returns a (thumbnail_url, index_page_url) tuple, or ('None', 'None') when
+        no .jpg frame was produced.
+        """
+        out_dir = filename.split('.')[0]
+        os.makedirs(out_dir, exist_ok=True)
+
+        fps = 0.5
+        # a missing or zero duration keeps the default rate instead of dividing by zero
+        duration = float(duration) if duration is not None else 0.0
+        if duration > 0:
+            if duration < 60:
+                fps = 10.0 / duration
+            elif duration < 120:
+                fps = 20.0 / duration
+            else:
+                fps = 40.0 / duration
+
+        stream = ffmpeg.input(filename)
+        stream = ffmpeg.filter(stream, 'fps', fps=fps).filter('scale', 512, -1)
+        stream.output(out_dir + '/out%d.jpg').run()
+
+        thumbnails = [fname for fname in os.listdir(out_dir + '/') if fname.endswith('jpg')]
+        if not thumbnails:
+            return ('None', 'None')
+
+        key_dir = filename.split('/')[1].split('.')[0]
+        cdn_urls = []
+
+        for fname in thumbnails:
+            thumbnail_filename = out_dir + '/' + fname
+            key = key_dir + '/' + fname
+
+            with open(thumbnail_filename, 'rb') as f:
+                self.do_s3_upload(f, key)
+
+            cdn_urls.append(self.get_cdn_url(key))
+            os.remove(thumbnail_filename)
+
+        key_thumb = cdn_urls[int(len(cdn_urls) * 0.1)]
+
+        # minimal HTML page that shows every thumbnail
+        index_page = f'''<html><head><title>{filename}</title></head>
+            <body>'''
+
+        for t in cdn_urls:
+            index_page += f'<img src="{t}" />'
+
+        index_page += "</body></html>"
+        index_fname = out_dir + '/index.html'
+
+        with open(index_fname, 'w') as f:
+            f.write(index_page)
+
+        thumb_index = key_dir + '/index.html'
+
+        # the context manager closes the handle once the upload is done
+        with open(index_fname, 'rb') as f:
+            self.s3.upload_fileobj(f, Bucket=os.getenv('DO_BUCKET'), Key=thumb_index,
+                                   ExtraArgs={'ACL': 'public-read', 'ContentType': 'text/html'})
+
+        thumb_index_cdn_url = self.get_cdn_url(thumb_index)
+
+        return (key_thumb, thumb_index_cdn_url)
diff --git a/archivers/telegram_archiver.py b/archivers/telegram_archiver.py
new file mode 100644
index 0000000..d9168e4
--- /dev/null
+++ b/archivers/telegram_archiver.py
@@ -0,0 +1,86 @@
+import os
+import requests
+from bs4 import BeautifulSoup
+from botocore.errorfactory import ClientError
+from .base_archiver import Archiver, ArchiveResult
+
+# TODO: get_cdn_url, get_thumbnails, do_s3_upload
+
+
+class TelegramArchiver(Archiver):
+    """Archives the video embedded in a t.me post."""
+    name = "telegram"
+
+    def download(self, url, check_if_exists=False):
+        """Download the post's video, upload it with thumbnails and describe the result.
+
+        Returns False when `url` is not a t.me link or the embed page has no
+        <video> tag with a src, otherwise an ArchiveResult.
+        """
+        # detect URLs that we definitely cannot handle
+        if 'http://t.me/' not in url and 'https://t.me/' not in url:
+            return False
+
+        headers = {
+            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36'
+        }
+        status = "success"
+
+        original_url = url
+
+        # TODO: check if we can do this more resilient to user-input
+        if not url.endswith("?embed=1"):
+            url += "?embed=1"
+
+        t = requests.get(url, headers=headers)
+        s = BeautifulSoup(t.content, 'html.parser')
+        video = s.find("video")
+
+        # a missing tag and a tag without src are both "nothing to download"
+        video_url = video.get('src') if video is not None else None
+        if not video_url:
+            return False  # could not find video
+
+        key = video_url.split('/')[-1].split('?')[0]
+        filename = 'tmp/' + key
+        cdn_url = self.get_cdn_url(key)
+
+        if check_if_exists:
+            try:
+                self.s3.head_object(Bucket=os.getenv('DO_BUCKET'), Key=key)
+                # file exists
+                status = 'already archived'
+            except ClientError:
+                pass
+
+        # fetched even when already archived: the thumbnails are built from the local copy
+        v = requests.get(video_url, headers=headers)
+
+        with open(filename, 'wb') as f:
+            f.write(v.content)
+
+        if status != 'already archived':
+            with open(filename, 'rb') as f:
+                self.do_s3_upload(f, key)
+
+        # the first <time> tag is read as the duration, the second as the post date
+        times = s.find_all('time')
+
+        # extract duration from HTML ("ss", "mm:ss" or "hh:mm:ss")
+        duration = None
+        if times and times[0].contents:
+            try:
+                seconds = 0.0
+                for part in str(times[0].contents[0]).split(':'):
+                    seconds = seconds * 60 + float(part)
+                duration = seconds
+            except ValueError:
+                pass  # unparsable text: keep None so thumbnails use the default rate
+        timestamp = times[1].get('datetime') if len(times) > 1 else None
+
+        # process thumbnails
+        key_thumb, thumb_index = self.get_thumbnails(filename, duration=duration)
+        os.remove(filename)
+
+        return ArchiveResult(status=status, cdn_url=cdn_url, thumbnail=key_thumb, thumbnail_index=thumb_index,
+                             duration=duration, title=original_url, timestamp=timestamp)
diff --git a/archivers/tiktok_archiver.py b/archivers/tiktok_archiver.py
new file mode 100644
index 0000000..1e3bcaf
--- /dev/null
+++ b/archivers/tiktok_archiver.py
@@ -0,0 +1,79 @@
+import os
+import traceback
+from botocore.errorfactory import ClientError
+import tiktok_downloader
+from loguru import logger
+from .base_archiver import Archiver, ArchiveResult
+
+# TODO: get_cdn_url, do_s3_upload, get_thumbnails
+
+
+class TiktokArchiver(Archiver):
+    """Archives a TikTok post through the tiktok_downloader package."""
+    name = "tiktok"
+
+    def download(self, url, check_if_exists=False):
+        """Download the post's video, upload it with thumbnails and describe the result.
+
+        Returns False when `url` does not contain 'tiktok.com', otherwise an
+        ArchiveResult whose status also carries any failure.
+        """
+        if 'tiktok.com' not in url:
+            return False
+
+        status = 'success'
+        # stays None when no media could be downloaded, so the result can still be built
+        cdn_url = None
+
+        try:
+            info = tiktok_downloader.info_post(url)
+            key = 'tiktok_' + str(info.id) + '.mp4'
+            filename = 'tmp/' + key
+
+            if check_if_exists:
+                try:
+                    self.s3.head_object(Bucket=os.getenv('DO_BUCKET'), Key=key)
+
+                    # file exists
+                    cdn_url = self.get_cdn_url(key)
+
+                    status = 'already archived'
+
+                except ClientError:
+                    pass
+
+            if status != 'already archived':
+                media = tiktok_downloader.snaptik(url).get_media()
+                if len(media) > 0:
+                    media[0].download(filename)
+                    with open(filename, 'rb') as f:
+                        self.do_s3_upload(f, key)
+
+                    cdn_url = self.get_cdn_url(key)
+                else:
+                    status = 'could not download media'
+
+            try:
+                key_thumb, thumb_index = self.get_thumbnails(
+                    filename, duration=info.duration)
+            except Exception:
+                # best effort: a thumbnail failure must not discard the archive result
+                key_thumb = ''
+                thumb_index = 'error creating thumbnails'
+
+            try:
+                os.remove(filename)
+            except FileNotFoundError:
+                logger.info(f'tmp file not found thus not deleted {filename}')
+
+            return ArchiveResult(status=status, cdn_url=cdn_url, thumbnail=key_thumb,
+                                 thumbnail_index=thumb_index, duration=info.duration, title=info.caption, timestamp=info.create.isoformat())
+
+        except tiktok_downloader.Except.InvalidUrl:
+            status = 'Invalid URL'
+            return ArchiveResult(status=status)
+
+        except Exception:
+            error = traceback.format_exc()
+            status = 'Other Tiktok error: ' + str(error)
+            return ArchiveResult(status=status)
diff --git a/archivers/wayback_archiver.py b/archivers/wayback_archiver.py
new file mode 100644
index 0000000..a021324
--- /dev/null
+++ b/archivers/wayback_archiver.py
@@ -0,0 +1,80 @@
+import os
+import time
+
+import requests
+from bs4 import BeautifulSoup
+
+from .base_archiver import Archiver, ArchiveResult
+
+
+class WaybackArchiver(Archiver):
+    """Archiver that asks the Internet Archive's save endpoint to capture a page."""
+    name = "wayback"
+
+    def __init__(self, s3_client):
+        super().__init__(s3_client)
+        # results already obtained by this instance, keyed by the submitted URL
+        self.seen_urls = {}
+
+    def download(self, url, check_if_exists=False):
+        """Submit `url` to web.archive.org/save and poll the job until it leaves 'pending'.
+
+        Returns an ArchiveResult. With `check_if_exists`, a URL this instance has
+        already saved is answered from `seen_urls` without a new request.
+        """
+        if check_if_exists and url in self.seen_urls:
+            return self.seen_urls[url]
+
+        ia_headers = {
+            "Accept": "application/json",
+            "Authorization": "LOW " + os.getenv('INTERNET_ARCHIVE_S3_KEY') + ":" + os.getenv('INTERNET_ARCHIVE_S3_SECRET')
+        }
+
+        r = requests.post(
+            'https://web.archive.org/save/', headers=ia_headers, data={'url': url})
+
+        if r.status_code != 200:
+            return ArchiveResult(status="Internet archive failed")
+
+        status_url = 'https://web.archive.org/save/status/' + r.json()['job_id']
+
+        status_r = requests.get(status_url, headers=ia_headers)
+
+        retries = 0
+
+        # wait 90-120 seconds for the archive job to finish
+        while (status_r.status_code != 200 or status_r.json()['status'] == 'pending') and retries < 30:
+            time.sleep(3)
+
+            try:
+                status_r = requests.get(status_url, headers=ia_headers)
+            except requests.exceptions.RequestException:
+                # transient network failure: keep the previous response and try again
+                time.sleep(1)
+
+            retries += 1
+
+        if status_r.status_code != 200:
+            return ArchiveResult(status="Internet archive failed")
+
+        status_json = status_r.json()
+
+        if status_json['status'] != 'success':
+            # 'message' may be absent (e.g. a job still pending after the last retry)
+            return ArchiveResult(status='Internet Archive failed: ' + str(status_json.get('message', '')))
+
+        archive_url = 'https://web.archive.org/web/' + \
+            status_json['timestamp'] + '/' + status_json['original_url']
+
+        try:
+            r = requests.get(archive_url)
+            parsed = BeautifulSoup(r.content, 'html.parser')
+            title = parsed.find_all('title')[0].text
+        except Exception:
+            # best effort: the capture succeeded even if its title cannot be read
+            title = "Could not get title"
+
+        result = ArchiveResult(
+            status='Internet Archive fallback', cdn_url=archive_url, title=title)
+        self.seen_urls[url] = result
+        return result
diff --git a/archivers/youtubedl_archiver.py b/archivers/youtubedl_archiver.py
new file mode 100644
index 0000000..8249cfa
--- /dev/null
+++ b/archivers/youtubedl_archiver.py
@@ -0,0 +1,88 @@
+
+import os
+import datetime
+import youtube_dl
+from loguru import logger
+from botocore.errorfactory import ClientError
+from .base_archiver import Archiver, ArchiveResult
+
+class YoutubeDLArchiver(Archiver):
+    """Download a single video with youtube-dl and store it via do_s3_upload."""
+    name = "yotube_dl"  # NOTE(review): probably meant "youtube_dl"; confirm nothing relies on it
+
+    @staticmethod
+    def _single_video(info):
+        """Return the info of the only video in `info`, or None if there is not exactly one."""
+        if 'entries' not in info:
+            return info
+        if len(info['entries']) != 1:  # an empty list used to raise IndexError
+            logger.warning('YoutubeDLArchiver cannot archive channels or pages with multiple videos')
+            return None
+        return info['entries'][0]
+
+    def download(self, url, check_if_exists=False):
+        """Archive the video at `url` and return an ArchiveResult.
+
+        Returns False when youtube-dl cannot extract a video or when `url`
+        does not resolve to exactly one video. With `check_if_exists`, a file
+        already present in the bucket is reported as 'already archived' and
+        not uploaded again.
+        """
+        ydl_opts = {'outtmpl': 'tmp/%(id)s.%(ext)s', 'quiet': False}
+        # startswith replaces slice checks whose 'wwww' typo never matched www URLs
+        if url.startswith(('https://facebook.com/', 'https://www.facebook.com/')) and os.getenv('FB_COOKIE'):
+            logger.info('Using Facebook cookie')
+            # NOTE(review): std_headers is module-wide, so the cookie stays set afterwards
+            youtube_dl.utils.std_headers['cookie'] = os.getenv('FB_COOKIE')
+
+        ydl = youtube_dl.YoutubeDL(ydl_opts)
+        cdn_url = None
+        status = 'success'
+
+        try:
+            info = ydl.extract_info(url, download=False)
+        except youtube_dl.utils.DownloadError:
+            # no video here
+            return False
+
+        if info.get('is_live'):
+            logger.warning("Live streaming media, not archiving now")
+            return ArchiveResult(status="Streaming media")
+
+        if check_if_exists:
+            video = self._single_video(info)
+            if video is None:
+                return False
+
+            key = self.get_key(ydl.prepare_filename(video))
+            try:
+                self.s3.head_object(Bucket=os.getenv('DO_BUCKET'), Key=key)
+            except ClientError:
+                pass  # head_object failed: treated as "not archived yet"
+            else:
+                cdn_url = self.get_cdn_url(key)
+                status = 'already archived'
+
+        # sometimes this results in a different filename, so do this again
+        info = self._single_video(ydl.extract_info(url, download=True))
+        if info is None:
+            return False
+
+        filename = ydl.prepare_filename(info)
+        if not os.path.exists(filename):
+            # splitext only strips the extension; split('.') broke on any other dot
+            filename = os.path.splitext(filename)[0] + '.mkv'
+
+        if status != 'already archived':
+            key = self.get_key(filename)
+            cdn_url = self.get_cdn_url(key)
+            with open(filename, 'rb') as f:
+                self.do_s3_upload(f, key)
+
+        duration = info.get('duration')
+        key_thumb, thumb_index = self.get_thumbnails(filename, duration=duration)
+        os.remove(filename)
+
+        return ArchiveResult(status=status, cdn_url=cdn_url, thumbnail=key_thumb, thumbnail_index=thumb_index, duration=duration,
+                             title=info.get('title'),
+                             timestamp=info['timestamp'] if 'timestamp' in info else datetime.datetime.strptime(info['upload_date'], '%Y%m%d').timestamp() if 'upload_date' in info else None)
diff --git a/auto_archive.py b/auto_archive.py
index ef4f89c..c478463 100644
--- a/auto_archive.py
+++ b/auto_archive.py
@@ -1,14 +1,12 @@
-from dataclasses import dataclass
-import gspread
-from pathlib import Path
-import datetime
-import boto3
import os
-from dotenv import load_dotenv
+import datetime
import argparse
import math
-import threading
+import gspread
+import boto3
from loguru import logger
+from dotenv import load_dotenv
+
import archivers
load_dotenv()
@@ -156,6 +154,7 @@ def process_sheet(sheet):
'duration')) if 'duration' in headers else None
+ # order matters, first to succeed excludes remaining
active_archivers = [
archivers.TelegramArchiver(s3_client),
archivers.TiktokArchiver(s3_client),
@@ -198,7 +197,7 @@ def process_sheet(sheet):
def main():
parser = argparse.ArgumentParser(
- description="Automatically use youtube-dl to download media from a Google Sheet")
+ description="Automatically archive social media videos from a Google Sheet")
parser.add_argument("--sheet", action="store", dest="sheet")
args = parser.parse_args()