mirror of
https://github.com/bellingcat/snscrape.git
synced 2026-06-07 18:18:29 +03:00
Cache Twitter tokens on disk from the CLI for reuse between scrapes
Closes #339
This commit is contained in:
1
setup.py
1
setup.py
@@ -28,6 +28,7 @@ setuptools.setup(
|
||||
'lxml',
|
||||
'beautifulsoup4',
|
||||
'pytz; python_version < "3.9.0"',
|
||||
'filelock',
|
||||
],
|
||||
python_requires = '~=3.8',
|
||||
extras_require = {
|
||||
|
||||
@@ -20,10 +20,12 @@ import dataclasses
|
||||
import datetime
|
||||
import email.utils
|
||||
import enum
|
||||
import filelock
|
||||
import itertools
|
||||
import json
|
||||
import random
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
import snscrape.base
|
||||
import string
|
||||
@@ -210,6 +212,62 @@ class GuestTokenManager:
|
||||
self._setTime = 0.0
|
||||
|
||||
|
||||
class _CLIGuestTokenManager(GuestTokenManager):
    '''Guest token manager that caches the token in a JSON file on disk.

    The file is `<cache home>/snscrape/cli-twitter-guest-token.json`, where the cache home is
    `$XDG_CACHE_HOME` if that is set to an absolute path and `~/.cache` otherwise.
    Every access to the file happens while holding a `filelock.FileLock` on `<file>.lock`.
    '''

    def __init__(self):
        '''Work out the cache file location and create its directory if it is missing.'''

        super().__init__()
        cacheHome = os.environ.get('XDG_CACHE_HOME')
        if not cacheHome or not os.path.isabs(cacheHome):
            # This should be ${HOME}/.cache, but the HOME environment variable may not exist on non-POSIX-compliant systems.
            # On POSIX-compliant systems, the XDG Base Directory specification is followed exactly since ~ expands to $HOME if it is present.
            cacheHome = os.path.join(os.path.expanduser('~'), '.cache')
        cacheDir = os.path.join(cacheHome, 'snscrape')
        if not os.path.isdir(cacheDir):
            # os.makedirs does not apply mode recursively anymore. https://bugs.python.org/issue42367
            # This ensures that the XDG_CACHE_HOME is created with the right permissions.
            os.makedirs(os.path.dirname(cacheDir), mode = 0o700, exist_ok = True)
            # exist_ok tolerates another process creating the directory between the isdir check and this call.
            # A FileExistsError is still raised if the path exists but is not a directory.
            os.makedirs(cacheDir, mode = 0o700, exist_ok = True)
        self._file = os.path.join(cacheDir, 'cli-twitter-guest-token.json')
        self._lockFile = f'{self._file}.lock'
        self._lock = filelock.FileLock(self._lockFile)

    def _read(self):
        '''Load the token and its set time from the cache file.

        Does nothing if the file does not exist or its contents cannot be used
        (invalid JSON or missing keys); in that case the in-memory values are left untouched.
        '''

        with self._lock:
            try:
                with open(self._file, 'r') as fp:
                    _logger.info(f'Reading guest token from {self._file}')
                    o = json.load(fp)
                token, setTime = o['token'], o['setTime']
            except FileNotFoundError:
                return None
            except (json.JSONDecodeError, KeyError, TypeError) as e:
                # A damaged cache must not make every later run fail; treat it like a missing file.
                # It gets overwritten the next time a token is set.
                _logger.warning(f'Ignoring unusable guest token file {self._file}: {e!r}')
                return None
        self._token = token
        self._setTime = setTime

    def _write(self):
        '''Store the current token and its set time in the cache file.'''

        with self._lock:
            _logger.info(f'Writing guest token to {self._file}')
            with open(self._file, 'w') as fp:
                # Use the stored values rather than the properties: the token property reads the file
                # when the token is falsy, and at this point the file has already been truncated.
                json.dump({'token': self._token, 'setTime': self._setTime}, fp)

    @property
    def token(self):
        '''The guest token; read from the cache file first if none is held in memory.'''

        if not self._token:
            self._read()
        return self._token

    @token.setter
    def token(self, token):
        # super() cannot be used directly for assigning to a parent property (https://bugs.python.org/issue14965),
        # so the parent's property object is fetched and its setter invoked explicitly.
        # The class is named explicitly because super(type(self), ...) would recurse forever in a subclass.
        super(_CLIGuestTokenManager, type(self)).token.__set__(self, token)
        self._write()

    @property
    def setTime(self):
        '''The set time belonging to the current token.'''

        self.token # Implicitly reads from the file if necessary
        return self._setTime

    def reset(self):
        '''Reset the in-memory state via the parent class and delete the cache file.'''

        super().reset()
        with self._lock:
            try:
                os.remove(self._file)
            except FileNotFoundError:
                # Nothing was cached yet, or another process already removed the file.
                pass
|
||||
|
||||
|
||||
class _TwitterAPIScraper(snscrape.base.Scraper):
|
||||
def __init__(self, baseUrl, guestTokenManager = None, **kwargs):
|
||||
super().__init__(**kwargs)
|
||||
@@ -552,6 +610,11 @@ class _TwitterAPIScraper(snscrape.base.Scraper):
|
||||
labelKwargs['longDescription'] = label['longDescription']['text']
|
||||
return UserLabel(**labelKwargs)
|
||||
|
||||
@classmethod
def _construct(cls, argparseArgs, *args, **kwargs):
    '''Delegate to the parent `_construct` with a fresh `_CLIGuestTokenManager`.

    The `guestTokenManager` keyword argument is always set to the new manager,
    replacing any value the caller passed.
    '''

    tokenManager = _CLIGuestTokenManager()
    kwargs['guestTokenManager'] = tokenManager
    return super()._construct(argparseArgs, *args, **kwargs)
|
||||
|
||||
|
||||
class TwitterSearchScraper(_TwitterAPIScraper):
|
||||
name = 'twitter-search'
|
||||
|
||||
Reference in New Issue
Block a user