Cache Twitter guest tokens on disk when running from the CLI so they can be reused between scrapes

Closes #339
This commit is contained in:
JustAnotherArchivist
2022-01-05 02:20:40 +00:00
parent ca00b480b1
commit acb7f10a4f
2 changed files with 64 additions and 0 deletions

View File

@@ -28,6 +28,7 @@ setuptools.setup(
'lxml', 'lxml',
'beautifulsoup4', 'beautifulsoup4',
'pytz; python_version < "3.9.0"', 'pytz; python_version < "3.9.0"',
'filelock',
], ],
python_requires = '~=3.8', python_requires = '~=3.8',
extras_require = { extras_require = {

View File

@@ -20,10 +20,12 @@ import dataclasses
import datetime import datetime
import email.utils import email.utils
import enum import enum
import filelock
import itertools import itertools
import json import json
import random import random
import logging import logging
import os
import re import re
import snscrape.base import snscrape.base
import string import string
@@ -210,6 +212,62 @@ class GuestTokenManager:
self._setTime = 0.0 self._setTime = 0.0
class _CLIGuestTokenManager(GuestTokenManager):
    '''Guest token manager that persists the token in a JSON file on disk.

    The file is stored in an 'snscrape' directory below XDG_CACHE_HOME (falling back to ~/.cache).
    Every access to the file happens while holding a file lock located next to it.
    '''

    def __init__(self):
        '''Determine the cache file location and create the cache directory if it is missing.'''
        super().__init__()
        cacheHome = os.environ.get('XDG_CACHE_HOME')
        if not cacheHome or not os.path.isabs(cacheHome):
            # This should be ${HOME}/.cache, but the HOME environment variable may not exist on non-POSIX-compliant systems.
            # On POSIX-compliant systems, the XDG Base Directory specification is followed exactly since ~ expands to $HOME if it is present.
            cacheHome = os.path.join(os.path.expanduser('~'), '.cache')
        cacheDir = os.path.join(cacheHome, 'snscrape')
        if not os.path.isdir(cacheDir):
            # os.makedirs does not apply mode recursively anymore. https://bugs.python.org/issue42367
            # This ensures that the XDG_CACHE_HOME is created with the right permissions.
            os.makedirs(os.path.dirname(cacheDir), mode = 0o700, exist_ok = True)
            # exist_ok tolerates another process creating the directory between the isdir check and this call.
            os.makedirs(cacheDir, mode = 0o700, exist_ok = True)
        self._file = os.path.join(cacheDir, 'cli-twitter-guest-token.json')
        self._lockFile = f'{self._file}.lock'
        self._lock = filelock.FileLock(self._lockFile)

    def _read(self):
        '''Load the token and its set time from the cache file; do nothing if the file does not exist.'''
        with self._lock:
            if not os.path.exists(self._file):
                return None
            _logger.info(f'Reading guest token from {self._file}')
            with open(self._file, 'r') as fp:
                o = json.load(fp)
        self._token = o['token']
        self._setTime = o['setTime']

    def _write(self):
        '''Store the current token and its set time in the cache file.'''
        with self._lock:
            _logger.info(f'Writing guest token to {self._file}')
            with open(self._file, 'w') as fp:
                # Use the stored values directly: going through the properties could re-read the old file
                # when the new token is falsy and write the stale token back.
                json.dump({'token': self._token, 'setTime': self._setTime}, fp)

    @property
    def token(self):
        '''The guest token; read from the cache file if no token is held in memory.'''
        if not self._token:
            self._read()
        return self._token

    @token.setter
    def token(self, token):
        # Properties cannot be set through a plain super() proxy. https://bugs.python.org/issue14965
        # The class is named explicitly because super(type(self), ...) would recurse endlessly in a subclass.
        super(_CLIGuestTokenManager, type(self)).token.__set__(self, token)
        self._write()

    @property
    def setTime(self):
        '''The set time belonging to the token; read from the cache file if necessary.'''
        self.token # Implicitly reads from the file if necessary
        return self._setTime

    def reset(self):
        '''Reset the in-memory state and delete the cache file if it exists.'''
        super().reset()
        with self._lock:
            try:
                os.remove(self._file)
            except FileNotFoundError:
                # Nothing was cached yet, or another process already removed the file.
                pass
class _TwitterAPIScraper(snscrape.base.Scraper): class _TwitterAPIScraper(snscrape.base.Scraper):
def __init__(self, baseUrl, guestTokenManager = None, **kwargs): def __init__(self, baseUrl, guestTokenManager = None, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
@@ -552,6 +610,11 @@ class _TwitterAPIScraper(snscrape.base.Scraper):
labelKwargs['longDescription'] = label['longDescription']['text'] labelKwargs['longDescription'] = label['longDescription']['text']
return UserLabel(**labelKwargs) return UserLabel(**labelKwargs)
@classmethod
def _construct(cls, argparseArgs, *args, **kwargs):
    '''Delegate to the parent's _construct with a _CLIGuestTokenManager passed as guestTokenManager.

    Any guestTokenManager given by the caller is replaced.
    '''
    cliKwargs = {**kwargs, 'guestTokenManager': _CLIGuestTokenManager()}
    return super()._construct(argparseArgs, *args, **cliKwargs)
class TwitterSearchScraper(_TwitterAPIScraper): class TwitterSearchScraper(_TwitterAPIScraper):
name = 'twitter-search' name = 'twitter-search'