mirror of
https://github.com/bellingcat/cisticola.git
synced 2026-06-08 03:18:34 +03:00
Modified the langdetect detect method to decrease run time, fixed an indentation error in transform_info, and prototyped removal of the offset in transform_all_untransformed.
This commit is contained in:
@@ -11,7 +11,7 @@ import pytesseract
|
||||
import PIL
|
||||
import exiftool
|
||||
import re
|
||||
from langdetect import detect, DetectorFactory
|
||||
from langdetect import PROFILES_DIRECTORY, DetectorFactory
|
||||
from langdetect.lang_detect_exception import LangDetectException
|
||||
from loguru import logger
|
||||
import spacy
|
||||
@@ -165,6 +165,15 @@ nlp_ru = spacy.load('ru_core_news_sm', disable=['parser', 'tok2vec', 'attribute_
|
||||
nlp_nl = spacy.load('nl_core_news_sm', disable=['parser', 'tok2vec', 'attribute_ruler'])
|
||||
nlp_xx = spacy.load('xx_ent_wiki_sm')
|
||||
|
||||
factory = DetectorFactory()
|
||||
factory.load_profile(PROFILES_DIRECTORY)
|
||||
detector = factory.create()
|
||||
|
||||
def detect(text, detector=detector):
    """Return the language code langdetect infers for ``text``.

    Drop-in replacement for ``langdetect.detect`` that reuses one pre-built
    ``Detector`` instead of creating a new one on every call.

    Args:
        text: The string whose language should be detected.
        detector: The langdetect ``Detector`` to reuse. Defaults to the
            module-level instance; it is mutated on every call, so it must
            not be shared between concurrent callers.

    Returns:
        Whatever ``detector.detect()`` returns for ``text`` (a language code
        string in langdetect).

    Raises:
        LangDetectException: Propagated from langdetect, e.g. when ``text``
            contains no features it can use.
    """
    # A Detector accumulates state between calls, so everything that depends
    # on the previous text has to be cleared before it is reused.
    detector.text = ""
    # langdetect only runs detection while ``langprob`` is None and otherwise
    # returns the cached probabilities; without this reset every call after
    # the first would report the language of the first text.
    detector.langprob = None
    detector.append(text)
    return detector.detect()
|
||||
|
||||
@dataclass
|
||||
class Post:
|
||||
"""An object with fields for columns in the analysis table"""
|
||||
|
||||
@@ -192,22 +192,25 @@ class ETLController:
|
||||
session = self.session()
|
||||
|
||||
BATCH_SIZE = 50000
|
||||
offset = 0
|
||||
batch = []
|
||||
batch = (session.query(ScraperResult)
|
||||
.join(Post, isouter=True)
|
||||
.where(Post.raw_id == None)
|
||||
.order_by(ScraperResult.date.asc())
|
||||
.limit(BATCH_SIZE)
|
||||
).all()
|
||||
|
||||
query = (session.query(ScraperResult)
|
||||
.join(Post, isouter=True)
|
||||
.where(Post.raw_id == None)
|
||||
.order_by(ScraperResult.date.asc())
|
||||
)
|
||||
while len(batch) > 0:
|
||||
logger.info(f"Fetching untransformed posts batch of {BATCH_SIZE}")
|
||||
|
||||
while len(batch) > 0 or offset == 0:
|
||||
logger.info(f"Fetching untransformed posts batch of {BATCH_SIZE}, offset {offset}")
|
||||
batch = (session.query(ScraperResult)
|
||||
.join(Post, isouter=True)
|
||||
.where(Post.raw_id == None)
|
||||
.where(ScraperResult.date >= max(batch, key=lambda v: v.date).date)
|
||||
.order_by(ScraperResult.date.asc())
|
||||
.limit(BATCH_SIZE)
|
||||
).all()
|
||||
|
||||
batch = query.slice(offset, offset + BATCH_SIZE).all()
|
||||
offset += BATCH_SIZE
|
||||
|
||||
logger.info(f"Found {len(batch)} items to ETL ({offset} already processed)")
|
||||
logger.info(f"Found {len(batch)} items to ETL")
|
||||
|
||||
self.transform_results(batch, hydrate=hydrate)
|
||||
|
||||
@@ -233,8 +236,8 @@ class ETLController:
|
||||
session.commit()
|
||||
break
|
||||
|
||||
if handled == False:
|
||||
logger.warning(f"No Transformer could handle raw channel info ID {result.id} with platform {result.platform} ({result.date_archived})")
|
||||
if handled == False:
|
||||
logger.warning(f"No Transformer could handle raw channel info ID {result.id} with platform {result.platform} ({result.date_archived})")
|
||||
|
||||
|
||||
@logger.catch(reraise=True)
|
||||
|
||||
Reference in New Issue
Block a user