# Review notes. Text ahead of the first "diff --git" header is commentary and
# is skipped by git apply / patch.
# - This patch was recovered from a copy whose line breaks and indentation
#   were lost. Indentation and blank context lines below are reconstructed.
#   The last hunk removes and re-adds textually identical lines, so its
#   indentation levels are a guess. TODO confirm against the repository
#   before applying.
# - The "index" hashes are carried over unchanged and do not describe the
#   amended post-image.
# - Amended relative to the recovered patch: detect() also clears the cached
#   result of the shared detector; the ETL loop transforms every batch it
#   fetches (the recovered version re-queried before transforming, which
#   dropped the first batch) and stops when a re-fetch returns the same rows.
diff --git a/cisticola/base.py b/cisticola/base.py
index 28ff9f8..5d0c12f 100644
--- a/cisticola/base.py
+++ b/cisticola/base.py
@@ -11,7 +11,7 @@ import pytesseract
 import PIL
 import exiftool
 import re
-from langdetect import detect, DetectorFactory
+from langdetect import PROFILES_DIRECTORY, DetectorFactory
 from langdetect.lang_detect_exception import LangDetectException
 from loguru import logger
 import spacy
@@ -165,6 +165,25 @@ nlp_ru = spacy.load('ru_core_news_sm', disable=['parser', 'tok2vec', 'attribute_
 nlp_nl = spacy.load('nl_core_news_sm', disable=['parser', 'tok2vec', 'attribute_ruler'])
 nlp_xx = spacy.load('xx_ent_wiki_sm')
 
+# Profiles are loaded once at import time; detect() calls share this detector.
+factory = DetectorFactory()
+factory.load_profile(PROFILES_DIRECTORY)
+detector = factory.create()
+
+def detect(text, detector=detector):
+    """Return the language code langdetect reports for ``text``.
+
+    ``detector`` defaults to the shared module-level detector and is reset
+    before use, so any state left by an earlier call is discarded.
+    """
+    # The shared detector keeps state between calls: clear the buffered text
+    # and the cached probabilities, otherwise detect() would answer from the
+    # first text it ever saw.
+    detector.text = ""
+    detector.langprob = None
+    detector.append(text)
+    return detector.detect()
+
 @dataclass
 class Post:
     """An object with fields for columns in the analysis table"""
diff --git a/cisticola/transformer/base.py b/cisticola/transformer/base.py
index 32ed37c..6b85b80 100644
--- a/cisticola/transformer/base.py
+++ b/cisticola/transformer/base.py
@@ -192,22 +192,34 @@ class ETLController:
         session = self.session()
 
         BATCH_SIZE = 50000
-        offset = 0
+        # Lower bound for the next fetch; None until a batch has been handled.
+        last_date = None
         batch = []
 
         query = (session.query(ScraperResult)
                     .join(Post, isouter=True)
                     .where(Post.raw_id == None)
                     .order_by(ScraperResult.date.asc())
                 )
 
-        while len(batch) > 0 or offset == 0:
-            logger.info(f"Fetching untransformed posts batch of {BATCH_SIZE}, offset {offset}")
+        while True:
+            logger.info(f"Fetching untransformed posts batch of {BATCH_SIZE}")
 
-            batch = query.slice(offset, offset + BATCH_SIZE).all()
-            offset += BATCH_SIZE
+            # The query only matches rows with no joined Post, so its result
+            # shrinks as rows are transformed: page by date, not by offset.
+            page = query if last_date is None else query.where(ScraperResult.date >= last_date)
+            next_batch = page.limit(BATCH_SIZE).all()
 
-            logger.info(f"Found {len(batch)} items to ETL ({offset} already processed)")
+            if not next_batch:
+                break
+            if next_batch == batch:
+                # Same rows came back, so none of them gained a Post.
+                logger.warning("Re-fetched the same untransformed posts batch; stopping to avoid looping forever")
+                break
+            batch = next_batch
+            last_date = max(batch, key=lambda v: v.date).date
+
+            logger.info(f"Found {len(batch)} items to ETL")
 
             self.transform_results(batch, hydrate=hydrate)
 
@@ -233,8 +245,8 @@ class ETLController:
                     session.commit()
                     break
 
-                if handled == False:
-                    logger.warning(f"No Transformer could handle raw channel info ID {result.id} with platform {result.platform} ({result.date_archived})")
+            if handled == False:
+                logger.warning(f"No Transformer could handle raw channel info ID {result.id} with platform {result.platform} ({result.date_archived})")
 
 
     @logger.catch(reraise=True)