From f3ce22666562bed2780181dbef95b8dee5a5e69e Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Mon, 21 Feb 2022 14:19:09 +0100 Subject: [PATCH] split into multiple files MVP --- .gitignore | 3 +- Pipfile | 1 - Pipfile.lock | 145 +----------- README.md | 2 + archivers.py | 390 -------------------------------- archivers/__init__.py | 6 + archivers/base_archiver.py | 115 ++++++++++ archivers/telegram_archiver.py | 76 +++++++ archivers/tiktok_archiver.py | 68 ++++++ archivers/wayback_archiver.py | 73 ++++++ archivers/youtubedl_archiver.py | 88 +++++++ auto_archive.py | 15 +- 12 files changed, 446 insertions(+), 536 deletions(-) delete mode 100644 archivers.py create mode 100644 archivers/__init__.py create mode 100644 archivers/base_archiver.py create mode 100644 archivers/telegram_archiver.py create mode 100644 archivers/tiktok_archiver.py create mode 100644 archivers/wayback_archiver.py create mode 100644 archivers/youtubedl_archiver.py diff --git a/.gitignore b/.gitignore index b6a6b68..5d7eec9 100644 --- a/.gitignore +++ b/.gitignore @@ -1,7 +1,8 @@ tmp/ -.env +.env* .DS_Store expmt/ service_account.json __pycache__/ ._* +anu.html \ No newline at end of file diff --git a/Pipfile b/Pipfile index 0d954c9..27071fa 100644 --- a/Pipfile +++ b/Pipfile @@ -10,7 +10,6 @@ python-dotenv = "*" youtube_dl = "*" argparse = "*" beautifulsoup4 = "*" -nordvpn-switcher = "*" tiktok-downloader = {git = "https://github.com/msramalho/tiktok-downloader"} bs4 = "*" loguru = "*" diff --git a/Pipfile.lock b/Pipfile.lock index b354d59..9879884 100644 --- a/Pipfile.lock +++ b/Pipfile.lock @@ -1,7 +1,7 @@ { "_meta": { "hash": { - "sha256": "af39efbad8c78641a732697001193b5f4f92a0af8a9709081428001362a47060" + "sha256": "9a5218275503e5ae779407349d0a76f44712dc4824e066b10aeb047264a168be" }, "pipfile-spec": 6, "requires": { @@ -93,6 +93,14 @@ ], "version": "==1.2.58" }, + "faker": { + "hashes": [ + 
"sha256:ee8d9181137cdd2b198bd3d0653b0a3b7b385213862348e15ba8a423324b702b", + "sha256:f545b2a1ba5f7effc4ed71af0a5204d939445f0190838d41bee6bc160958bfbe" + ], + "markers": "python_version >= '3.6'", + "version": "==13.0.0" + }, "ffmpeg-python": { "hashes": [ "sha256:65225db34627c578ef0e11c8b1eb528bb35e024752f6f10b78c011f6f64c4127", @@ -180,73 +188,6 @@ "index": "pypi", "version": "==0.6.0" }, - "lxml": { - "hashes": [ - "sha256:078306d19a33920004addeb5f4630781aaeabb6a8d01398045fcde085091a169", - "sha256:0c1978ff1fd81ed9dcbba4f91cf09faf1f8082c9d72eb122e92294716c605428", - "sha256:1010042bfcac2b2dc6098260a2ed022968dbdfaf285fc65a3acf8e4eb1ffd1bc", - "sha256:1d650812b52d98679ed6c6b3b55cbb8fe5a5460a0aef29aeb08dc0b44577df85", - "sha256:20b8a746a026017acf07da39fdb10aa80ad9877046c9182442bf80c84a1c4696", - "sha256:2403a6d6fb61c285969b71f4a3527873fe93fd0abe0832d858a17fe68c8fa507", - "sha256:24f5c5ae618395ed871b3d8ebfcbb36e3f1091fd847bf54c4de623f9107942f3", - "sha256:28d1af847786f68bec57961f31221125c29d6f52d9187c01cd34dc14e2b29430", - "sha256:31499847fc5f73ee17dbe1b8e24c6dafc4e8d5b48803d17d22988976b0171f03", - "sha256:31ba2cbc64516dcdd6c24418daa7abff989ddf3ba6d3ea6f6ce6f2ed6e754ec9", - "sha256:330bff92c26d4aee79c5bc4d9967858bdbe73fdbdbacb5daf623a03a914fe05b", - "sha256:5045ee1ccd45a89c4daec1160217d363fcd23811e26734688007c26f28c9e9e7", - "sha256:52cbf2ff155b19dc4d4100f7442f6a697938bf4493f8d3b0c51d45568d5666b5", - "sha256:530f278849031b0eb12f46cca0e5db01cfe5177ab13bd6878c6e739319bae654", - "sha256:545bd39c9481f2e3f2727c78c169425efbfb3fbba6e7db4f46a80ebb249819ca", - "sha256:5804e04feb4e61babf3911c2a974a5b86f66ee227cc5006230b00ac6d285b3a9", - "sha256:5a58d0b12f5053e270510bf12f753a76aaf3d74c453c00942ed7d2c804ca845c", - "sha256:5f148b0c6133fb928503cfcdfdba395010f997aa44bcf6474fcdd0c5398d9b63", - "sha256:5f7d7d9afc7b293147e2d506a4596641d60181a35279ef3aa5778d0d9d9123fe", - "sha256:60d2f60bd5a2a979df28ab309352cdcf8181bda0cca4529769a945f09aba06f9", - 
"sha256:6259b511b0f2527e6d55ad87acc1c07b3cbffc3d5e050d7e7bcfa151b8202df9", - "sha256:6268e27873a3d191849204d00d03f65c0e343b3bcb518a6eaae05677c95621d1", - "sha256:627e79894770783c129cc5e89b947e52aa26e8e0557c7e205368a809da4b7939", - "sha256:62f93eac69ec0f4be98d1b96f4d6b964855b8255c345c17ff12c20b93f247b68", - "sha256:6d6483b1229470e1d8835e52e0ff3c6973b9b97b24cd1c116dca90b57a2cc613", - "sha256:6f7b82934c08e28a2d537d870293236b1000d94d0b4583825ab9649aef7ddf63", - "sha256:6fe4ef4402df0250b75ba876c3795510d782def5c1e63890bde02d622570d39e", - "sha256:719544565c2937c21a6f76d520e6e52b726d132815adb3447ccffbe9f44203c4", - "sha256:730766072fd5dcb219dd2b95c4c49752a54f00157f322bc6d71f7d2a31fecd79", - "sha256:74eb65ec61e3c7c019d7169387d1b6ffcfea1b9ec5894d116a9a903636e4a0b1", - "sha256:7993232bd4044392c47779a3c7e8889fea6883be46281d45a81451acfd704d7e", - "sha256:80bbaddf2baab7e6de4bc47405e34948e694a9efe0861c61cdc23aa774fcb141", - "sha256:86545e351e879d0b72b620db6a3b96346921fa87b3d366d6c074e5a9a0b8dadb", - "sha256:891dc8f522d7059ff0024cd3ae79fd224752676447f9c678f2a5c14b84d9a939", - "sha256:8a31f24e2a0b6317f33aafbb2f0895c0bce772980ae60c2c640d82caac49628a", - "sha256:8b99ec73073b37f9ebe8caf399001848fced9c08064effdbfc4da2b5a8d07b93", - "sha256:986b7a96228c9b4942ec420eff37556c5777bfba6758edcb95421e4a614b57f9", - "sha256:a1547ff4b8a833511eeaceacbcd17b043214fcdb385148f9c1bc5556ca9623e2", - "sha256:a2bfc7e2a0601b475477c954bf167dee6d0f55cb167e3f3e7cefad906e7759f6", - "sha256:a3c5f1a719aa11866ffc530d54ad965063a8cbbecae6515acbd5f0fae8f48eaa", - "sha256:a9f1c3489736ff8e1c7652e9dc39f80cff820f23624f23d9eab6e122ac99b150", - "sha256:aa0cf4922da7a3c905d000b35065df6184c0dc1d866dd3b86fd961905bbad2ea", - "sha256:ad4332a532e2d5acb231a2e5d33f943750091ee435daffca3fec0a53224e7e33", - "sha256:b2582b238e1658c4061ebe1b4df53c435190d22457642377fd0cb30685cdfb76", - "sha256:b6fc2e2fb6f532cf48b5fed57567ef286addcef38c28874458a41b7837a57807", - "sha256:b92d40121dcbd74831b690a75533da703750f7041b4bf951befc657c37e5695a", 
- "sha256:bbab6faf6568484707acc052f4dfc3802bdb0cafe079383fbaa23f1cdae9ecd4", - "sha256:c0b88ed1ae66777a798dc54f627e32d3b81c8009967c63993c450ee4cbcbec15", - "sha256:ce13d6291a5f47c1c8dbd375baa78551053bc6b5e5c0e9bb8e39c0a8359fd52f", - "sha256:db3535733f59e5605a88a706824dfcb9bd06725e709ecb017e165fc1d6e7d429", - "sha256:dd10383f1d6b7edf247d0960a3db274c07e96cf3a3fc7c41c8448f93eac3fb1c", - "sha256:e01f9531ba5420838c801c21c1b0f45dbc9607cb22ea2cf132844453bec863a5", - "sha256:e11527dc23d5ef44d76fef11213215c34f36af1608074561fcc561d983aeb870", - "sha256:e1ab2fac607842ac36864e358c42feb0960ae62c34aa4caaf12ada0a1fb5d99b", - "sha256:e1fd7d2fe11f1cb63d3336d147c852f6d07de0d0020d704c6031b46a30b02ca8", - "sha256:e9f84ed9f4d50b74fbc77298ee5c870f67cb7e91dcdc1a6915cb1ff6a317476c", - "sha256:ec4b4e75fc68da9dc0ed73dcdb431c25c57775383fec325d23a770a64e7ebc87", - "sha256:f10ce66fcdeb3543df51d423ede7e238be98412232fca5daec3e54bcd16b8da0", - "sha256:f63f62fc60e6228a4ca9abae28228f35e1bd3ce675013d1dfb828688d50c6e23", - "sha256:fa56bb08b3dd8eac3a8c5b7d075c94e74f755fd9d8a04543ae8d37b1612dd170", - "sha256:fa9b7c450be85bfc6cd39f6df8c5b8cbd76b5d6fc1f69efec80203f9894b885f" - ], - "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'", - "version": "==4.8.0" - }, "markupsafe": { "hashes": [ "sha256:023af8c54fe63530545f70dd2a2a7eed18d07a9a77b94e8bf1e2ff7f252db9a3", @@ -293,14 +234,6 @@ "markers": "python_version >= '3.7'", "version": "==2.1.0" }, - "nordvpn-switcher": { - "hashes": [ - "sha256:764db054715d949af0f836da5e46c4053afe92282a0d4b2cfc6b8cfe8c3045de", - "sha256:9788c2c3113d0d7b00894dae3ea19bed14f3b38d111d7223c126f001b1729a3b" - ], - "index": "pypi", - "version": "==0.2.9" - }, "oauthlib": { "hashes": [ "sha256:23a8208d75b902797ea29fd31fa80a15ed9dc2c6c16fe73f5d346f83f6fa27a2", @@ -309,59 +242,6 @@ "markers": "python_version >= '3.6'", "version": "==3.2.0" }, - "pathlib": { - "hashes": [ - "sha256:6940718dfc3eff4258203ad5021090933e5c04707d5ca8cc9e73c94a7894ea9f" - 
], - "version": "==1.0.1" - }, - "psutil": { - "hashes": [ - "sha256:072664401ae6e7c1bfb878c65d7282d4b4391f1bc9a56d5e03b5a490403271b5", - "sha256:1070a9b287846a21a5d572d6dddd369517510b68710fca56b0e9e02fd24bed9a", - "sha256:1d7b433519b9a38192dfda962dd8f44446668c009833e1429a52424624f408b4", - "sha256:3151a58f0fbd8942ba94f7c31c7e6b310d2989f4da74fcbf28b934374e9bf841", - "sha256:32acf55cb9a8cbfb29167cd005951df81b567099295291bcfd1027365b36591d", - "sha256:3611e87eea393f779a35b192b46a164b1d01167c9d323dda9b1e527ea69d697d", - "sha256:3d00a664e31921009a84367266b35ba0aac04a2a6cad09c550a89041034d19a0", - "sha256:4e2fb92e3aeae3ec3b7b66c528981fd327fb93fd906a77215200404444ec1845", - "sha256:539e429da49c5d27d5a58e3563886057f8fc3868a5547b4f1876d9c0f007bccf", - "sha256:55ce319452e3d139e25d6c3f85a1acf12d1607ddedea5e35fb47a552c051161b", - "sha256:58c7d923dc209225600aec73aa2c4ae8ea33b1ab31bc11ef8a5933b027476f07", - "sha256:7336292a13a80eb93c21f36bde4328aa748a04b68c13d01dfddd67fc13fd0618", - "sha256:742c34fff804f34f62659279ed5c5b723bb0195e9d7bd9907591de9f8f6558e2", - "sha256:7641300de73e4909e5d148e90cc3142fb890079e1525a840cf0dfd39195239fd", - "sha256:76cebf84aac1d6da5b63df11fe0d377b46b7b500d892284068bacccf12f20666", - "sha256:7779be4025c540d1d65a2de3f30caeacc49ae7a2152108adeaf42c7534a115ce", - "sha256:7d190ee2eaef7831163f254dc58f6d2e2a22e27382b936aab51c835fc080c3d3", - "sha256:8293942e4ce0c5689821f65ce6522ce4786d02af57f13c0195b40e1edb1db61d", - "sha256:869842dbd66bb80c3217158e629d6fceaecc3a3166d3d1faee515b05dd26ca25", - "sha256:90a58b9fcae2dbfe4ba852b57bd4a1dded6b990a33d6428c7614b7d48eccb492", - "sha256:9b51917c1af3fa35a3f2dabd7ba96a2a4f19df3dec911da73875e1edaf22a40b", - "sha256:b2237f35c4bbae932ee98902a08050a27821f8f6dfa880a47195e5993af4702d", - "sha256:c3400cae15bdb449d518545cbd5b649117de54e3596ded84aacabfbb3297ead2", - "sha256:c51f1af02334e4b516ec221ee26b8fdf105032418ca5a5ab9737e8c87dafe203", - "sha256:cb8d10461c1ceee0c25a64f2dd54872b70b89c26419e147a05a10b753ad36ec2", - 
"sha256:d62a2796e08dd024b8179bd441cb714e0f81226c352c802fca0fd3f89eeacd94", - "sha256:df2c8bd48fb83a8408c8390b143c6a6fa10cb1a674ca664954de193fdcab36a9", - "sha256:e5c783d0b1ad6ca8a5d3e7b680468c9c926b804be83a3a8e95141b05c39c9f64", - "sha256:e9805fed4f2a81de98ae5fe38b75a74c6e6ad2df8a5c479594c7629a1fe35f56", - "sha256:ea42d747c5f71b5ccaa6897b216a7dadb9f52c72a0fe2b872ef7d3e1eacf3ba3", - "sha256:ef216cc9feb60634bda2f341a9559ac594e2eeaadd0ba187a4c2eb5b5d40b91c", - "sha256:ff0d41f8b3e9ebb6b6110057e40019a432e96aae2008951121ba4e56040b84f3" - ], - "markers": "python_version >= '2.6' and python_version not in '3.0, 3.1, 3.2, 3.3'", - "version": "==5.9.0" - }, - "py-mini-racer": { - "hashes": [ - "sha256:346e73bb89a2024888244d487834be24a121089ceb0641dd0200cb96c4e24b57", - "sha256:42896c24968481dd953eeeb11de331f6870917811961c9b26ba09071e07180e2", - "sha256:97cab31bbf63ce462ba4cd6e978c572c916d8b15586156c7c5e0b2e42c10baab", - "sha256:f71e36b643d947ba698c57cd9bd2232c83ca997b0802fc2f7f79582377040c11" - ], - "version": "==0.6.0" - }, "pyasn1": { "hashes": [ "sha256:014c0e9976956a08139dc0712ae195324a75e142284d5f87f1a87ee1b068a359", @@ -422,13 +302,6 @@ "index": "pypi", "version": "==0.19.2" }, - "random-user-agent": { - "hashes": [ - "sha256:535636a55fb63fe3d74fd0260d854c241d9f2946447026464e578e68eac17dac", - "sha256:8f8ca26ec8cb1d24ad1758d8b8f700d154064d641dbe9a255cfec42960fbd012" - ], - "version": "==1.0.1" - }, "requests": { "hashes": [ "sha256:68d7c56fd5a8999887728ef304a6d12edc7be74f1cfa47714fc8b414525c9a61", diff --git a/README.md b/README.md index 2e40bcc..cec6e9a 100644 --- a/README.md +++ b/README.md @@ -8,6 +8,8 @@ If you are using `pipenv` (recommended), `pipenv install` is sufficient to insta [A Google Service account is necessary for use with `gspread`.](https://gspread.readthedocs.io/en/latest/oauth2.html#for-bots-using-service-account) Credentials for this account should be stored in `service_account.json`, in the same directory as the script. 
+[ffmpeg](https://www.ffmpeg.org/) must also be installed locally for this tool to work. + A `.env` file is required for saving content to a Digital Ocean space, and for archiving pages to the Internet Archive. This file should also be in the script directory, and should contain the following variables: ``` diff --git a/archivers.py b/archivers.py deleted file mode 100644 index d8a72f6..0000000 --- a/archivers.py +++ /dev/null @@ -1,390 +0,0 @@ -from dataclasses import dataclass -import youtube_dl -from bs4 import BeautifulSoup -import requests -import tiktok_downloader -from loguru import logger -import os -import datetime -import ffmpeg -from botocore.errorfactory import ClientError -import time -import traceback - -# TODO There should be a better way of generating keys, that adds the following info: -# - name of sheet that it is being archived from -# (this means we might archive the same media twice on different sheets, but that's OK I think) -# - name of archiver/platform that the video comes from -# This should make it easier to maintain and clean the archive later - -# TODO "check_if_exists" has lots of repeated code across the archivers. Can this be -# cleaned up? Difficult is we don't know the filename until the archivers start working. 
- - -def get_cdn_url(key): - return 'https://{}.{}.cdn.digitaloceanspaces.com/{}'.format( - os.getenv('DO_BUCKET'), os.getenv('DO_SPACES_REGION'), key) - - -def do_s3_upload(s3_client, f, key): - s3_client.upload_fileobj(f, Bucket=os.getenv( - 'DO_BUCKET'), Key=key, ExtraArgs={'ACL': 'public-read'}) - - -def get_key(filename): - key = filename.split('/')[1] - if 'unknown_video' in key: - key = key.replace('unknown_video', 'jpg') - return key - - -def get_thumbnails(filename, s3_client, duration=None): - if not os.path.exists(filename.split('.')[0]): - os.mkdir(filename.split('.')[0]) - - fps = 0.5 - if duration is not None: - duration = float(duration) - - if duration < 60: - fps = 10.0 / duration - elif duration < 120: - fps = 20.0 / duration - else: - fps = 40.0 / duration - - stream = ffmpeg.input(filename) - stream = ffmpeg.filter(stream, 'fps', fps=fps).filter('scale', 512, -1) - stream.output(filename.split('.')[0] + '/out%d.jpg').run() - - thumbnails = os.listdir(filename.split('.')[0] + '/') - cdn_urls = [] - - for fname in thumbnails: - if fname[-3:] == 'jpg': - thumbnail_filename = filename.split('.')[0] + '/' + fname - key = filename.split('/')[1].split('.')[0] + '/' + fname - - cdn_url = get_cdn_url(key) - - with open(thumbnail_filename, 'rb') as f: - do_s3_upload(s3_client, f, key) - - cdn_urls.append(cdn_url) - os.remove(thumbnail_filename) - - if len(cdn_urls) == 0: - return ('None', 'None') - - key_thumb = cdn_urls[int(len(cdn_urls)*0.1)] - - index_page = f'''{filename} - ''' - - for t in cdn_urls: - index_page += f'' - - index_page += f"" - index_fname = filename.split('.')[0] + '/index.html' - - with open(index_fname, 'w') as f: - f.write(index_page) - - thumb_index = filename.split('/')[1].split('.')[0] + '/index.html' - - s3_client.upload_fileobj(open(index_fname, 'rb'), Bucket=os.getenv( - 'DO_BUCKET'), Key=thumb_index, ExtraArgs={'ACL': 'public-read', 'ContentType': 'text/html'}) - - thumb_index_cdn_url = get_cdn_url(thumb_index) - - return 
(key_thumb, thumb_index_cdn_url) - - -@dataclass -class ArchiveResult: - status: str - cdn_url: str = None - thumbnail: str = None - thumbnail_index: str = None - duration: float = None - title: str = None - timestamp: datetime.datetime = None - - -class Archiver: - def __init__(self, s3_client): - self.s3 = s3_client - - def download(self, url): - pass - - -class TelegramArchiver(Archiver): - def download(self, url, check_if_exists=False): - # detect URLs that we definitely cannot handle - if 'http://t.me/' not in url and 'https://t.me/' not in url: - return False - - headers = { - 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36'} - status = "success" - - original_url = url - - if url[-8:] != "?embed=1": - url += "?embed=1" - - t = requests.get(url, headers=headers) - s = BeautifulSoup(t.content, 'html.parser') - video = s.find("video") - - if video is None: - return False # could not find video - - video_url = video.get('src') - key = video_url.split('/')[-1].split('?')[0] - filename = 'tmp/' + key - - if check_if_exists: - try: - self.s3.head_object(Bucket=os.getenv('DO_BUCKET'), Key=key) - - # file exists - cdn_url = get_cdn_url(key) - - status = 'already archived' - - except ClientError: - pass - - v = requests.get(video_url, headers=headers) - - with open(filename, 'wb') as f: - f.write(v.content) - - if status != 'already archived': - cdn_url = get_cdn_url(key) - - with open(filename, 'rb') as f: - do_s3_upload(self.s3, f, key) - - # extract duration from HTML - duration = s.find_all('time')[0].contents[0] - if ':' in duration: - duration = float(duration.split( - ':')[0])*60 + float(duration.split(':')[1]) - else: - duration = float(duration) - - # process thumbnails - key_thumb, thumb_index = get_thumbnails( - filename, self.s3, duration=duration) - os.remove(filename) - - return ArchiveResult(status=status, cdn_url=cdn_url, thumbnail=key_thumb, thumbnail_index=thumb_index, 
- duration=duration, title=original_url, timestamp=s.find_all('time')[1].get('datetime')) - - -class YoutubeDLArchiver(Archiver): - def download(self, url, check_if_exists=False): - ydl_opts = {'outtmpl': 'tmp/%(id)s.%(ext)s', 'quiet': False} - if (url[0:21] == 'https://facebook.com/' or url[0:25] == 'https://wwww.facebook.com/') and os.getenv('FB_COOKIE'): - logger.info('Using Facebook cookie') - youtube_dl.utils.std_headers['cookie'] = os.getenv('FB_COOKIE') - - ydl = youtube_dl.YoutubeDL(ydl_opts) - cdn_url = None - status = 'success' - - try: - info = ydl.extract_info(url, download=False) - except youtube_dl.utils.DownloadError: - # no video here - return False - - if 'is_live' in info and info['is_live']: - logger.warning("Live streaming media, not archiving now") - return ArchiveResult(status="Streaming media") - - if check_if_exists: - if 'entries' in info: - if len(info['entries']) > 1: - logger.warning( - 'YoutubeDLArchiver cannot archive channels or pages with multiple videos') - return False - - filename = ydl.prepare_filename(info['entries'][0]) - else: - filename = ydl.prepare_filename(info) - - key = get_key(filename) - - try: - self.s3.head_object(Bucket=os.getenv('DO_BUCKET'), Key=key) - - # file exists - cdn_url = get_cdn_url(key) - - status = 'already archived' - - except ClientError: - pass - - # sometimes this results in a different filename, so do this again - info = ydl.extract_info(url, download=True) - - if 'entries' in info: - if len(info['entries']) > 1: - logger.warning( - 'YoutubeDLArchiver cannot archive channels or pages with multiple videos') - return False - else: - info = info['entries'][0] - - filename = ydl.prepare_filename(info) - - if not os.path.exists(filename): - filename = filename.split('.')[0] + '.mkv' - - if status != 'already archived': - key = get_key(filename) - cdn_url = get_cdn_url(key) - - with open(filename, 'rb') as f: - do_s3_upload(self.s3, f, key) - - # get duration - duration = info['duration'] if 'duration' 
in info else None - - # get thumbnails - key_thumb, thumb_index = get_thumbnails( - filename, self.s3, duration=duration) - os.remove(filename) - - return ArchiveResult(status=status, cdn_url=cdn_url, thumbnail=key_thumb, thumbnail_index=thumb_index, duration=duration, - title=info['title'] if 'title' in info else None, - timestamp=info['timestamp'] if 'timestamp' in info else datetime.datetime.strptime(info['upload_date'], '%Y%m%d').timestamp() if 'upload_date' in info else None) - - -class WaybackArchiver(Archiver): - def __init__(self, s3_client): - self.s3 = s3_client - self.seen_urls = {} - - def download(self, url, check_if_exists=False): - if check_if_exists and url in self.seen_urls: - return self.seen_urls[url] - - ia_headers = { - "Accept": "application/json", - "Authorization": "LOW " + os.getenv('INTERNET_ARCHIVE_S3_KEY') + ":" + os.getenv('INTERNET_ARCHIVE_S3_SECRET') - } - - r = requests.post( - 'https://web.archive.org/save/', headers=ia_headers, data={'url': url}) - - if r.status_code != 200: - return ArchiveResult(status="Internet archive failed") - - job_id = r.json()['job_id'] - - status_r = requests.get( - 'https://web.archive.org/save/status/' + job_id, headers=ia_headers) - - retries = 0 - - # wait 90-120 seconds for the archive job to finish - while (status_r.status_code != 200 or status_r.json()['status'] == 'pending') and retries < 30: - time.sleep(3) - - try: - status_r = requests.get( - 'https://web.archive.org/save/status/' + job_id, headers=ia_headers) - except: - time.sleep(1) - - retries += 1 - - if status_r.status_code != 200: - return ArchiveResult(status="Internet archive failed") - - status_json = status_r.json() - - if status_json['status'] != 'success': - return ArchiveResult(status='Internet Archive failed: ' + status_json['message']) - - archive_url = 'https://web.archive.org/web/' + \ - status_json['timestamp'] + '/' + status_json['original_url'] - - try: - r = requests.get(archive_url) - - parsed = BeautifulSoup( - 
r.content, 'html.parser') - - title = parsed.find_all('title')[ - 0].text - except: - title = "Could not get title" - - result = ArchiveResult( - status='Internet Archive fallback', cdn_url=archive_url, title=title) - self.seen_urls[url] = result - return result - - -class TiktokArchiver(Archiver): - def download(self, url, check_if_exists=False): - if 'tiktok.com' not in url: - return False - - status = 'success' - - try: - info = tiktok_downloader.info_post(url) - key = 'tiktok_' + str(info.id) + '.mp4' - filename = 'tmp/' + key - - if check_if_exists: - try: - self.s3.head_object(Bucket=os.getenv('DO_BUCKET'), Key=key) - - # file exists - cdn_url = get_cdn_url(key) - - status = 'already archived' - - except ClientError: - pass - - if status != 'already archived': - media = tiktok_downloader.snaptik(url).get_media() - if len(media) > 0: - media[0].download(filename) - with open(filename, 'rb') as f: - do_s3_upload(self.s3, f, key) - - cdn_url = get_cdn_url(key) - else: - status = 'could not download media' - - try: - key_thumb, thumb_index = get_thumbnails( - filename, self.s3, duration=info.duration) - except: - key_thumb = '' - thumb_index = 'error creating thumbnails' - - os.remove(filename) - - return ArchiveResult(status=status, cdn_url=cdn_url, thumbnail=key_thumb, - thumbnail_index=thumb_index, duration=info.duration, title=info.caption, timestamp=info.create.isoformat()) - - except tiktok_downloader.Except.InvalidUrl: - status = 'Invalid URL' - return ArchiveResult(status=status) - - except: - error = traceback.format_exc() - status = 'Other Tiktok error: ' + str(error) - return ArchiveResult(status=status) diff --git a/archivers/__init__.py b/archivers/__init__.py new file mode 100644 index 0000000..e6c4ba6 --- /dev/null +++ b/archivers/__init__.py @@ -0,0 +1,6 @@ +# we need to explicitly expose the available imports here +from .base_archiver import * +from .telegram_archiver import * +from .tiktok_archiver import * +from .wayback_archiver import * +from 
.youtubedl_archiver import * \ No newline at end of file diff --git a/archivers/base_archiver.py b/archivers/base_archiver.py new file mode 100644 index 0000000..3f9f4ac --- /dev/null +++ b/archivers/base_archiver.py @@ -0,0 +1,115 @@ +import os +import ffmpeg +from dataclasses import dataclass +import datetime +from loguru import logger + +# TODO There should be a better way of generating keys, that adds the following info: +# - name of sheet that it is being archived from +# (this means we might archive the same media twice on different sheets, but that's OK I think) +# - name of archiver/platform that the video comes from +# This should make it easier to maintain and clean the archive later + +# TODO "check_if_exists" has lots of repeated code across the archivers. Can this be +# cleaned up? Difficult is we don't know the filename until the archivers start working. + + +@dataclass +class ArchiveResult: + status: str + cdn_url: str = None + thumbnail: str = None + thumbnail_index: str = None + duration: float = None + title: str = None + timestamp: datetime.datetime = None + + +class Archiver: + name = "default" + + def __init__(self, s3_client): + self.s3 = s3_client + + def __str__(self): + return self.__class__.__name__ + + def download(self, url, check_if_exists=False): + logger.error("method 'download' not implemented") + + def get_cdn_url(self, key): + return 'https://{}.{}.cdn.digitaloceanspaces.com/{}'.format( + os.getenv('DO_BUCKET'), os.getenv('DO_SPACES_REGION'), key) + + def do_s3_upload(self, f, key): + self.s3.upload_fileobj(f, Bucket=os.getenv( + 'DO_BUCKET'), Key=key, ExtraArgs={'ACL': 'public-read'}) + + def get_key(self, filename): + print(f"key base implementation: {self.name}") + # TODO: refactor to be more manageable + key = filename.split('/')[1] + if 'unknown_video' in key: + key = key.replace('unknown_video', 'jpg') + return key + + def get_thumbnails(self, filename, duration=None): + if not os.path.exists(filename.split('.')[0]): + 
os.mkdir(filename.split('.')[0]) + + fps = 0.5 + if duration is not None: + duration = float(duration) + + if duration < 60: + fps = 10.0 / duration + elif duration < 120: + fps = 20.0 / duration + else: + fps = 40.0 / duration + + stream = ffmpeg.input(filename) + stream = ffmpeg.filter(stream, 'fps', fps=fps).filter('scale', 512, -1) + stream.output(filename.split('.')[0] + '/out%d.jpg').run() + + thumbnails = os.listdir(filename.split('.')[0] + '/') + cdn_urls = [] + + for fname in thumbnails: + if fname[-3:] == 'jpg': + thumbnail_filename = filename.split('.')[0] + '/' + fname + key = filename.split('/')[1].split('.')[0] + '/' + fname + + cdn_url = self.get_cdn_url(key) + + with open(thumbnail_filename, 'rb') as f: + self.do_s3_upload(f, key) + + cdn_urls.append(cdn_url) + os.remove(thumbnail_filename) + + if len(cdn_urls) == 0: + return ('None', 'None') + + key_thumb = cdn_urls[int(len(cdn_urls) * 0.1)] + + index_page = f'''{filename} + ''' + + for t in cdn_urls: + index_page += f'' + + index_page += f"" + index_fname = filename.split('.')[0] + '/index.html' + + with open(index_fname, 'w') as f: + f.write(index_page) + + thumb_index = filename.split('/')[1].split('.')[0] + '/index.html' + + self.s3.upload_fileobj(open(index_fname, 'rb'), Bucket=os.getenv( + 'DO_BUCKET'), Key=thumb_index, ExtraArgs={'ACL': 'public-read', 'ContentType': 'text/html'}) + + thumb_index_cdn_url = self.get_cdn_url(thumb_index) + + return (key_thumb, thumb_index_cdn_url) diff --git a/archivers/telegram_archiver.py b/archivers/telegram_archiver.py new file mode 100644 index 0000000..d9168e4 --- /dev/null +++ b/archivers/telegram_archiver.py @@ -0,0 +1,76 @@ +import os +import requests +from bs4 import BeautifulSoup +from botocore.errorfactory import ClientError +from .base_archiver import Archiver, ArchiveResult + +# TODO: get_cdn_url, get_thumbnails, do_s3_upload + + +class TelegramArchiver(Archiver): + name = "telegram" + + def download(self, url, check_if_exists=False): + # detect 
URLs that we definitely cannot handle + if 'http://t.me/' not in url and 'https://t.me/' not in url: + return False + + headers = { + 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36' + } + status = "success" + + original_url = url + + # TODO: check if we can do this more resilient to user-input + if url[-8:] != "?embed=1": + url += "?embed=1" + + t = requests.get(url, headers=headers) + s = BeautifulSoup(t.content, 'html.parser') + video = s.find("video") + + if video is None: + return False # could not find video + + video_url = video.get('src') + key = video_url.split('/')[-1].split('?')[0] + filename = 'tmp/' + key + + if check_if_exists: + try: + self.s3.head_object(Bucket=os.getenv('DO_BUCKET'), Key=key) + + # file exists + cdn_url = self.get_cdn_url(key) + + status = 'already archived' + + except ClientError: + pass + + v = requests.get(video_url, headers=headers) + + with open(filename, 'wb') as f: + f.write(v.content) + + if status != 'already archived': + cdn_url = self.get_cdn_url(key) + + with open(filename, 'rb') as f: + self.do_s3_upload(f, key) + + # extract duration from HTML + duration = s.find_all('time')[0].contents[0] + if ':' in duration: + duration = float(duration.split( + ':')[0]) * 60 + float(duration.split(':')[1]) + else: + duration = float(duration) + + # process thumbnails + key_thumb, thumb_index = self.get_thumbnails(filename, duration=duration) + os.remove(filename) + + return ArchiveResult(status=status, cdn_url=cdn_url, thumbnail=key_thumb, thumbnail_index=thumb_index, + duration=duration, title=original_url, timestamp=s.find_all('time')[1].get('datetime')) diff --git a/archivers/tiktok_archiver.py b/archivers/tiktok_archiver.py new file mode 100644 index 0000000..1e3bcaf --- /dev/null +++ b/archivers/tiktok_archiver.py @@ -0,0 +1,68 @@ +import os, traceback +from botocore.errorfactory import ClientError +import tiktok_downloader +from loguru import 
logger +from .base_archiver import Archiver, ArchiveResult + +# TODO: get_cdn_url, do_s3_upload, get_thumbnails + + +class TiktokArchiver(Archiver): + name = "tiktok" + + def download(self, url, check_if_exists=False): + if 'tiktok.com' not in url: + return False + + status = 'success' + + try: + info = tiktok_downloader.info_post(url) + key = 'tiktok_' + str(info.id) + '.mp4' + filename = 'tmp/' + key + + if check_if_exists: + try: + self.s3.head_object(Bucket=os.getenv('DO_BUCKET'), Key=key) + + # file exists + cdn_url = self.get_cdn_url(key) + + status = 'already archived' + + except ClientError: + pass + + if status != 'already archived': + media = tiktok_downloader.snaptik(url).get_media() + if len(media) > 0: + media[0].download(filename) + with open(filename, 'rb') as f: + self.do_s3_upload(f, key) + + cdn_url = self.get_cdn_url(key) + else: + status = 'could not download media' + + try: + key_thumb, thumb_index = self.get_thumbnails( + filename, duration=info.duration) + except: + key_thumb = '' + thumb_index = 'error creating thumbnails' + + try: os.remove(filename) + except FileNotFoundError: + logger.info(f'tmp file not found thus not deleted {filename}') + + return ArchiveResult(status=status, cdn_url=cdn_url, thumbnail=key_thumb, + thumbnail_index=thumb_index, duration=info.duration, title=info.caption, timestamp=info.create.isoformat()) + + except tiktok_downloader.Except.InvalidUrl: + status = 'Invalid URL' + return ArchiveResult(status=status) + + except: + error = traceback.format_exc() + status = 'Other Tiktok error: ' + str(error) + return ArchiveResult(status=status) diff --git a/archivers/wayback_archiver.py b/archivers/wayback_archiver.py new file mode 100644 index 0000000..a021324 --- /dev/null +++ b/archivers/wayback_archiver.py @@ -0,0 +1,73 @@ +import time, requests, os +from bs4 import BeautifulSoup + +from .base_archiver import Archiver, ArchiveResult + + +class WaybackArchiver(Archiver): + name = "wayback" + + def __init__(self, 
s3_client): + self.s3 = s3_client + self.seen_urls = {} + + def download(self, url, check_if_exists=False): + if check_if_exists and url in self.seen_urls: + return self.seen_urls[url] + + ia_headers = { + "Accept": "application/json", + "Authorization": "LOW " + os.getenv('INTERNET_ARCHIVE_S3_KEY') + ":" + os.getenv('INTERNET_ARCHIVE_S3_SECRET') + } + + r = requests.post( + 'https://web.archive.org/save/', headers=ia_headers, data={'url': url}) + + if r.status_code != 200: + return ArchiveResult(status="Internet archive failed") + + job_id = r.json()['job_id'] + + status_r = requests.get( + 'https://web.archive.org/save/status/' + job_id, headers=ia_headers) + + retries = 0 + + # wait 90-120 seconds for the archive job to finish + while (status_r.status_code != 200 or status_r.json()['status'] == 'pending') and retries < 30: + time.sleep(3) + + try: + status_r = requests.get( + 'https://web.archive.org/save/status/' + job_id, headers=ia_headers) + except: + time.sleep(1) + + retries += 1 + + if status_r.status_code != 200: + return ArchiveResult(status="Internet archive failed") + + status_json = status_r.json() + + if status_json['status'] != 'success': + return ArchiveResult(status='Internet Archive failed: ' + status_json['message']) + + archive_url = 'https://web.archive.org/web/' + \ + status_json['timestamp'] + '/' + status_json['original_url'] + + try: + r = requests.get(archive_url) + + parsed = BeautifulSoup( + r.content, 'html.parser') + + title = parsed.find_all('title')[ + 0].text + except: + title = "Could not get title" + + result = ArchiveResult( + status='Internet Archive fallback', cdn_url=archive_url, title=title) + self.seen_urls[url] = result + return result diff --git a/archivers/youtubedl_archiver.py b/archivers/youtubedl_archiver.py new file mode 100644 index 0000000..8249cfa --- /dev/null +++ b/archivers/youtubedl_archiver.py @@ -0,0 +1,88 @@ + +import os +import datetime +import youtube_dl +from loguru import logger +from 
class YoutubeDLArchiver(Archiver):
    """Archiver that downloads a single video with youtube-dl and uploads it.

    Relies on ``self.s3``, ``self.get_key``, ``self.get_cdn_url``,
    ``self.do_s3_upload`` and ``self.get_thumbnails``, none of which are
    defined in this class (presumably provided by ``Archiver`` -- confirm in
    base_archiver).
    """
    # NOTE(review): "yotube_dl" looks like a typo for "youtube_dl"; left
    # unchanged because other code may compare against this exact value.
    name = "yotube_dl"

    # URL prefixes for which the FB_COOKIE environment variable is applied.
    _FACEBOOK_PREFIXES = ('https://facebook.com/', 'https://www.facebook.com/')

    @staticmethod
    def _single_video(info):
        """Return the one video dict in *info*, or ``None`` if there is not exactly one."""
        if 'entries' not in info:
            return info

        entries = info['entries']
        if len(entries) > 1:
            logger.warning(
                'YoutubeDLArchiver cannot archive channels or pages with multiple videos')
            return None
        if not entries:
            logger.warning('YoutubeDLArchiver found no videos to archive')
            return None
        return entries[0]

    def download(self, url, check_if_exists=False):
        """Download the video at *url*, upload it and build thumbnails.

        Args:
            url: Page containing a single video.
            check_if_exists: When true, look the expected object key up in
                the ``DO_BUCKET`` bucket first; if present the upload is
                skipped and the status becomes ``'already archived'``. The
                video is still downloaded locally so thumbnails can be made.

        Returns:
            ``False`` when youtube-dl finds no video or the page does not
            hold exactly one video; an ``ArchiveResult`` with status
            ``"Streaming media"`` for live streams; otherwise an
            ``ArchiveResult`` describing the archived file.
        """
        ydl_opts = {'outtmpl': 'tmp/%(id)s.%(ext)s', 'quiet': False}

        fb_cookie = os.getenv('FB_COOKIE')
        # The previous check compared a 25-character slice against the
        # 26-character literal 'https://wwww.facebook.com/', so it could never
        # match www.facebook.com URLs.
        if fb_cookie and url.startswith(self._FACEBOOK_PREFIXES):
            logger.info('Using Facebook cookie')
            youtube_dl.utils.std_headers['cookie'] = fb_cookie

        ydl = youtube_dl.YoutubeDL(ydl_opts)
        cdn_url = None
        status = 'success'

        try:
            info = ydl.extract_info(url, download=False)
        except youtube_dl.utils.DownloadError:
            # no video here
            return False

        if info.get('is_live'):
            logger.warning("Live streaming media, not archiving now")
            return ArchiveResult(status="Streaming media")

        if check_if_exists:
            video = self._single_video(info)
            if video is None:
                return False

            key = self.get_key(ydl.prepare_filename(video))

            try:
                self.s3.head_object(Bucket=os.getenv('DO_BUCKET'), Key=key)
            except ClientError:
                # head_object failed: treat the key as not archived yet
                pass
            else:
                # file exists
                cdn_url = self.get_cdn_url(key)
                status = 'already archived'

        # sometimes this results in a different filename, so do this again
        info = self._single_video(ydl.extract_info(url, download=True))
        if info is None:
            return False

        filename = ydl.prepare_filename(info)

        if not os.path.exists(filename):
            # Fall back to the same base name with an .mkv extension. splitext
            # removes only the final extension, so ids that contain dots are
            # kept intact.
            filename = os.path.splitext(filename)[0] + '.mkv'

        duration = info.get('duration')

        try:
            if status != 'already archived':
                key = self.get_key(filename)
                cdn_url = self.get_cdn_url(key)

                with open(filename, 'rb') as f:
                    self.do_s3_upload(f, key)

            key_thumb, thumb_index = self.get_thumbnails(filename, duration=duration)
        finally:
            # never leave the downloaded file behind, even if the upload or
            # thumbnailing fails
            if os.path.exists(filename):
                os.remove(filename)

        # Prefer the extractor's own timestamp; otherwise derive one from the
        # YYYYMMDD upload date when that is all we have.
        if 'timestamp' in info:
            timestamp = info['timestamp']
        elif 'upload_date' in info:
            timestamp = datetime.datetime.strptime(
                info['upload_date'], '%Y%m%d').timestamp()
        else:
            timestamp = None

        return ArchiveResult(
            status=status,
            cdn_url=cdn_url,
            thumbnail=key_thumb,
            thumbnail_index=thumb_index,
            duration=duration,
            title=info.get('title'),
            timestamp=timestamp)