mirror of
https://github.com/bellingcat/cisticola.git
synced 2026-06-13 13:58:33 +03:00
Fix bugs in Gettr/Rumble transformers, avoid offset in batch requests
This commit is contained in:
@@ -191,26 +191,36 @@ class ETLController:
|
|||||||
|
|
||||||
session = self.session()
|
session = self.session()
|
||||||
|
|
||||||
BATCH_SIZE = 50000
|
BATCH_SIZE = 5000
|
||||||
offset = 0
|
offset = 0
|
||||||
batch = []
|
batch = []
|
||||||
|
|
||||||
query = (session.query(ScraperResult)
|
logger.info(f"Fetching first untransformed post batch of {BATCH_SIZE}")
|
||||||
|
|
||||||
|
batch = (session.query(ScraperResult)
|
||||||
.join(Post, isouter=True)
|
.join(Post, isouter=True)
|
||||||
.where(Post.raw_id == None)
|
.where(Post.raw_id == None)
|
||||||
.order_by(ScraperResult.date.asc())
|
.order_by(ScraperResult.date.asc())
|
||||||
)
|
.limit(BATCH_SIZE)
|
||||||
|
).all()
|
||||||
|
|
||||||
while len(batch) > 0 or offset == 0:
|
while len(batch) > 0:
|
||||||
logger.info(f"Fetching untransformed posts batch of {BATCH_SIZE}, offset {offset}")
|
logger.info(f"Found {len(batch)} items to ETL")
|
||||||
|
|
||||||
batch = query.slice(offset, offset + BATCH_SIZE).all()
|
|
||||||
offset += BATCH_SIZE
|
|
||||||
|
|
||||||
logger.info(f"Found {len(batch)} items to ETL ({offset} already processed)")
|
|
||||||
|
|
||||||
self.transform_results(batch, hydrate=hydrate)
|
self.transform_results(batch, hydrate=hydrate)
|
||||||
|
|
||||||
|
logger.info(f"Fetching untransformed posts batch of {BATCH_SIZE}, offset {max(batch, key=lambda v: v.date).date}")
|
||||||
|
|
||||||
|
batch = (session.query(ScraperResult)
|
||||||
|
.join(Post, isouter=True)
|
||||||
|
.where(Post.raw_id == None)
|
||||||
|
.where(ScraperResult.date >= max(batch, key=lambda v: v.date).date)
|
||||||
|
.order_by(ScraperResult.date.asc())
|
||||||
|
.limit(BATCH_SIZE)
|
||||||
|
).all()
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
@logger.catch(reraise=True)
|
@logger.catch(reraise=True)
|
||||||
def transform_info(self, results: List[ChannelInfo]):
|
def transform_info(self, results: List[ChannelInfo]):
|
||||||
if self.session is None:
|
if self.session is None:
|
||||||
@@ -221,9 +231,9 @@ class ETLController:
|
|||||||
|
|
||||||
for result in results:
|
for result in results:
|
||||||
if result.scraper is not None and result.platform is not None:
|
if result.scraper is not None and result.platform is not None:
|
||||||
for transformer in self.transformers:
|
handled = False
|
||||||
handled = False
|
|
||||||
|
|
||||||
|
for transformer in self.transformers:
|
||||||
if transformer.can_handle(result):
|
if transformer.can_handle(result):
|
||||||
logger.trace(f"{transformer} is handling raw info result {result.id} ({result.date_archived})")
|
logger.trace(f"{transformer} is handling raw info result {result.id} ({result.date_archived})")
|
||||||
handled = True
|
handled = True
|
||||||
@@ -245,7 +255,7 @@ class ETLController:
|
|||||||
|
|
||||||
session = self.session()
|
session = self.session()
|
||||||
|
|
||||||
BATCH_SIZE = 50000
|
BATCH_SIZE = 10000
|
||||||
offset = 0
|
offset = 0
|
||||||
batch = []
|
batch = []
|
||||||
|
|
||||||
|
|||||||
@@ -49,7 +49,7 @@ class GettrTransformer(Transformer):
|
|||||||
|
|
||||||
def _get_channel_id(self, username: str, category: str, insert: Callable, session):
|
def _get_channel_id(self, username: str, category: str, insert: Callable, session):
|
||||||
|
|
||||||
channel = session.query(Channel).filter(func.lower(Channel.screenname)==func.lower(username), platform = 'Gettr').first()
|
channel = session.query(Channel).where((func.lower(Channel.screenname)==func.lower(username)) & (Channel.platform == 'Gettr')).first()
|
||||||
|
|
||||||
if channel is None:
|
if channel is None:
|
||||||
try:
|
try:
|
||||||
|
|||||||
@@ -90,7 +90,7 @@ def _process_number(s):
|
|||||||
if s is None:
|
if s is None:
|
||||||
return None
|
return None
|
||||||
else:
|
else:
|
||||||
s = s.replace(' ', '')
|
s = s.replace(' ', '').replace(',','')
|
||||||
if s.endswith('M'):
|
if s.endswith('M'):
|
||||||
return int(float(s[:-1]) * 1e6)
|
return int(float(s[:-1]) * 1e6)
|
||||||
elif s.endswith('K'):
|
elif s.endswith('K'):
|
||||||
|
|||||||
Reference in New Issue
Block a user