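Possible performance improvement: disable per-post enrichment (hashtags, crypto addresses, language detection, spaCy hydration) and raise the re-transform batch size from 5000 to 50000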

This commit is contained in:
Tristan Lee
2022-12-25 15:19:05 -08:00
parent d80ad442da
commit 294f6a5172
2 changed files with 21 additions and 19 deletions

View File

@@ -262,28 +262,28 @@ class Post:
self.outlinks += urls
self.outlinks = list(set(outlink for outlink in self.outlinks))
HASHTAG_REGEX = r"(?:^|\s)[#]{1}(\w+)"
# HASHTAG_REGEX = r"(?:^|\s)[#]{1}(\w+)"
hashtags = re.findall(HASHTAG_REGEX, self.content)
self.hashtags += hashtags
self.hashtags = list(set(hashtag.lower() for hashtag in self.hashtags))
# hashtags = re.findall(HASHTAG_REGEX, self.content)
# self.hashtags += hashtags
# self.hashtags = list(set(hashtag.lower() for hashtag in self.hashtags))
# regex patterns for finding crypto addresses
BTC_REGEX = r'\b(bc(0([ac-hj-np-z02-9]{39}|[ac-hj-np-z02-9]{59})|1[ac-hj-np-z02-9]{8,87})|[13][a-km-zA-HJ-NP-Z1-9]{25,35})\b'
ETHER_REGEX = r'(0x[a-fA-F0-9]{40})'
# # regex patterns for finding crypto addresses
# BTC_REGEX = r'\b(bc(0([ac-hj-np-z02-9]{39}|[ac-hj-np-z02-9]{59})|1[ac-hj-np-z02-9]{8,87})|[13][a-km-zA-HJ-NP-Z1-9]{25,35})\b'
# ETHER_REGEX = r'(0x[a-fA-F0-9]{40})'
self.cryptocurrency_addresses = [m[0] for m in re.findall(BTC_REGEX, self.content)] + re.findall(ETHER_REGEX, self.content)
# self.cryptocurrency_addresses = [m[0] for m in re.findall(BTC_REGEX, self.content)] + re.findall(ETHER_REGEX, self.content)
try:
self.detected_language = detect(self.content)
except LangDetectException:
self.detected_language = ""
# try:
# self.detected_language = detect(self.content)
# except LangDetectException:
# self.detected_language = ""
# Dutch (NL) is often misdetected as Afrikaans (af)
if self.detected_language == "af":
self.detected_language = "nl"
# # Dutch (NL) is often misdetected as Afrikaans (af)
# if self.detected_language == "af":
# self.detected_language = "nl"
self.hydrate_spacy()
# self.hydrate_spacy()
def hydrate_spacy(self):
ner_only = False

View File

@@ -307,7 +307,7 @@ class ETLController:
session = self.session()
BATCH_SIZE = 5000
BATCH_SIZE = 50000
batch = []
logger.info(f"Fetching first post batch of {BATCH_SIZE} to re-transform")
@@ -324,12 +324,14 @@ class ETLController:
self.retransform_results(batch, hydrate=hydrate, columns=columns)
logger.info(f"Fetching posts batch of {BATCH_SIZE} to re-transform, offset {max([raw.date for raw, _ in batch])}")
max_date = max([raw.date for raw, _ in batch])
logger.info(f"Fetching posts batch of {BATCH_SIZE} to re-transform, offset {max_date}")
batch = (session.query(ScraperResult, Post)
.filter(Post.raw_id == ScraperResult.id)
.filter_by(**query_kwargs)
.where(ScraperResult.date >= max([raw.date for raw, _ in batch]))
.where(ScraperResult.date >= max_date)
.order_by(ScraperResult.date.asc())
.limit(BATCH_SIZE)
).all()