From 294f6a5172e2acf02f5081ee50cb6f23e5a02d59 Mon Sep 17 00:00:00 2001 From: Tristan Lee Date: Sun, 25 Dec 2022 15:19:05 -0800 Subject: [PATCH] possible performance improvement --- cisticola/base.py | 32 ++++++++++++++++---------------- cisticola/transformer/base.py | 8 +++++--- 2 files changed, 21 insertions(+), 19 deletions(-) diff --git a/cisticola/base.py b/cisticola/base.py index d94bfd6..1022317 100644 --- a/cisticola/base.py +++ b/cisticola/base.py @@ -262,28 +262,28 @@ class Post: self.outlinks += urls self.outlinks = list(set(outlink for outlink in self.outlinks)) - HASHTAG_REGEX = r"(?:^|\s)[##]{1}(\w+)" + # HASHTAG_REGEX = r"(?:^|\s)[##]{1}(\w+)" - hashtags = re.findall(HASHTAG_REGEX, self.content) - self.hashtags += hashtags - self.hashtags = list(set(hashtag.lower() for hashtag in self.hashtags)) + # hashtags = re.findall(HASHTAG_REGEX, self.content) + # self.hashtags += hashtags + # self.hashtags = list(set(hashtag.lower() for hashtag in self.hashtags)) - # regex patterns for finding crypto addresses - BTC_REGEX = r'\b(bc(0([ac-hj-np-z02-9]{39}|[ac-hj-np-z02-9]{59})|1[ac-hj-np-z02-9]{8,87})|[13][a-km-zA-HJ-NP-Z1-9]{25,35})\b' - ETHER_REGEX = r'(0x[a-fA-F0-9]{40})' + # # regex patterns for finding crypto addresses + # BTC_REGEX = r'\b(bc(0([ac-hj-np-z02-9]{39}|[ac-hj-np-z02-9]{59})|1[ac-hj-np-z02-9]{8,87})|[13][a-km-zA-HJ-NP-Z1-9]{25,35})\b' + # ETHER_REGEX = r'(0x[a-fA-F0-9]{40})' - self.cryptocurrency_addresses = [m[0] for m in re.findall(BTC_REGEX, self.content)] + re.findall(ETHER_REGEX, self.content) + # self.cryptocurrency_addresses = [m[0] for m in re.findall(BTC_REGEX, self.content)] + re.findall(ETHER_REGEX, self.content) - try: - self.detected_language = detect(self.content) - except LangDetectException: - self.detected_language = "" + # try: + # self.detected_language = detect(self.content) + # except LangDetectException: + # self.detected_language = "" - # Dutch (NL) is often misdetected as Afrikaans (af) - if self.detected_language == "af": - self.detected_language = "nl" + # # Dutch (NL) is often misdetected as Afrikaans (af) + # if self.detected_language == "af": + # self.detected_language = "nl" - self.hydrate_spacy() + # self.hydrate_spacy() def hydrate_spacy(self): ner_only = False diff --git a/cisticola/transformer/base.py b/cisticola/transformer/base.py index 2649032..fd0eeb5 100644 --- a/cisticola/transformer/base.py +++ b/cisticola/transformer/base.py @@ -307,7 +307,7 @@ class ETLController: session = self.session() - BATCH_SIZE = 5000 + BATCH_SIZE = 50000 batch = [] logger.info(f"Fetching first post batch of {BATCH_SIZE} to re-transform") @@ -324,12 +324,14 @@ class ETLController: self.retransform_results(batch, hydrate=hydrate, columns=columns) - logger.info(f"Fetching posts batch of {BATCH_SIZE} to re-transform, offset {max([raw.date for raw, _ in batch])}") + max_date = max([raw.date for raw, _ in batch]) + + logger.info(f"Fetching posts batch of {BATCH_SIZE} to re-transform, offset {max_date}") batch = (session.query(ScraperResult, Post) .filter(Post.raw_id == ScraperResult.id) .filter_by(**query_kwargs) - .where(ScraperResult.date >= max([raw.date for raw, _ in batch])) + .where(ScraperResult.date >= max_date) .order_by(ScraperResult.date.asc()) .limit(BATCH_SIZE) ).all()