Modified the langdetect detect method to decrease run time, fixed an indentation error in transform_info, and prototyped removal of the offset in transform_all_untransformed.

This commit is contained in:
Tristan Lee
2022-07-01 03:52:14 -05:00
parent ed4723ed1e
commit 09f99392ef
2 changed files with 28 additions and 16 deletions

View File

@@ -11,7 +11,7 @@ import pytesseract
import PIL
import exiftool
import re
from langdetect import detect, DetectorFactory
from langdetect import PROFILES_DIRECTORY, DetectorFactory
from langdetect.lang_detect_exception import LangDetectException
from loguru import logger
import spacy
@@ -165,6 +165,15 @@ nlp_ru = spacy.load('ru_core_news_sm', disable=['parser', 'tok2vec', 'attribute_
nlp_nl = spacy.load('nl_core_news_sm', disable=['parser', 'tok2vec', 'attribute_ruler'])
nlp_xx = spacy.load('xx_ent_wiki_sm')
# Build a langdetect factory and load the language profiles from the package's
# PROFILES_DIRECTORY. NOTE(review): this appears to run once at module import
# (indentation is not visible here) — confirm.
factory = DetectorFactory()
factory.load_profile(PROFILES_DIRECTORY)
# Single Detector instance; detect() below uses it as its default argument.
detector = factory.create()
def detect(text, detector=detector):
    """Return the language langdetect detects for ``text``.

    Reuses an existing Detector instead of creating a new one per call.

    Args:
        text: The string to classify.
        detector: The Detector to reuse. Defaults to the shared ``detector``
            instance created above. Its state is overwritten on every call,
            so one detector must not be shared by concurrent callers.

    Returns:
        The value of ``detector.detect()`` for ``text``.

    Raises:
        Any exception raised by ``detector.append`` or ``detector.detect``
        propagates unchanged (presumably LangDetectException when the text
        has no usable features — confirm against langdetect).
    """
    # Clear everything the previous call left behind. Resetting ``text`` alone
    # is not enough: langdetect's Detector caches its result in ``langprob``
    # and only recomputes when that attribute is None, so without this reset
    # every call after the first would return the first text's language.
    detector.text = ""
    detector.langprob = None
    detector.append(text)
    return detector.detect()
@dataclass
class Post:
"""An object with fields for columns in the analysis table"""

View File

@@ -192,22 +192,25 @@ class ETLController:
session = self.session()
BATCH_SIZE = 50000
offset = 0
batch = []
batch = (session.query(ScraperResult)
.join(Post, isouter=True)
.where(Post.raw_id == None)
.order_by(ScraperResult.date.asc())
.limit(BATCH_SIZE)
).all()
query = (session.query(ScraperResult)
.join(Post, isouter=True)
.where(Post.raw_id == None)
.order_by(ScraperResult.date.asc())
)
while len(batch) > 0:
logger.info(f"Fetching untransformed posts batch of {BATCH_SIZE}")
while len(batch) > 0 or offset == 0:
logger.info(f"Fetching untransformed posts batch of {BATCH_SIZE}, offset {offset}")
batch = (session.query(ScraperResult)
.join(Post, isouter=True)
.where(Post.raw_id == None)
.where(ScraperResult.date >= max(batch, key=lambda v: v.date).date)
.order_by(ScraperResult.date.asc())
.limit(BATCH_SIZE)
).all()
batch = query.slice(offset, offset + BATCH_SIZE).all()
offset += BATCH_SIZE
logger.info(f"Found {len(batch)} items to ETL ({offset} already processed)")
logger.info(f"Found {len(batch)} items to ETL")
self.transform_results(batch, hydrate=hydrate)
@@ -233,8 +236,8 @@ class ETLController:
session.commit()
break
if handled == False:
logger.warning(f"No Transformer could handle raw channel info ID {result.id} with platform {result.platform} ({result.date_archived})")
if handled == False:
logger.warning(f"No Transformer could handle raw channel info ID {result.id} with platform {result.platform} ({result.date_archived})")
@logger.catch(reraise=True)