mirror of
https://github.com/bellingcat/cisticola.git
synced 2026-06-08 03:18:34 +03:00
Refactored spacy.load of the NLP models so that they are loaded during ETLController initialization instead of when cisticola.base is imported
This commit is contained in:
12
Pipfile.lock
generated
12
Pipfile.lock
generated
@@ -59,19 +59,19 @@
|
||||
},
|
||||
"boto3": {
|
||||
"hashes": [
|
||||
"sha256:1bae3e6222c7272af44ab4dc456fc4cb7d2d7044489b7a4a08f9cd0fbad6d213",
|
||||
"sha256:8630c2c38c3130e31e1a4182943aee8bc7dd1a1ad9729092b46fdbc8ac045a77"
|
||||
"sha256:a654d57e3882e7fd2c1260d604a44364a2fed00da4f52faf37e5901e71145df1",
|
||||
"sha256:e3c2e8e55c17af6671a5332d6ab4635ad9793c80d0ac6d78af7b30a994d0681b"
|
||||
],
|
||||
"index": "pypi",
|
||||
"version": "==1.28.19"
|
||||
"version": "==1.28.20"
|
||||
},
|
||||
"botocore": {
|
||||
"hashes": [
|
||||
"sha256:15f269945f319b0263cde9a61a25f2c9a83f6074b62cddae71edafec4a61e637",
|
||||
"sha256:724f9a1a91f88291f5adc6347705a31e52312c88cddd56e38709215b161e025a"
|
||||
"sha256:485ef175cd011ebc965f4577d8cc02a226c46bd608dd2bb75ce6938328cff0fd",
|
||||
"sha256:be51c5352162700e7beb0aa27af394adbbf86f8e7a2ca0c437d448d0a7b2bdfb"
|
||||
],
|
||||
"markers": "python_version >= '3.7'",
|
||||
"version": "==1.31.19"
|
||||
"version": "==1.31.20"
|
||||
},
|
||||
"brotli": {
|
||||
"hashes": [
|
||||
|
||||
@@ -32,6 +32,14 @@ from .utils import make_request
|
||||
# Disable decompression bomb check
|
||||
PIL.Image.MAX_IMAGE_PIXELS = 1024 * 1024 * 256
|
||||
|
||||
URL_REGEX = r"""(?i)\b((?:https?:(?:/{1,3}|[a-z0-9%])|[a-z0-9.\-]+[.](?:com|net|org|edu|gov|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|post|pro|tel|travel|xxx|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|Ja|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za|zm|zw)/)(?:[^\s()<>{}\[\]]+|\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\))+(?:\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\)|[^\s`!()\[\]{};:\'\".,<>?«»“”‘’])|(?:(?<!@)[a-z0-9]+(?:[.\-][a-z0-9]+)*[.](?:com|net|org|edu|gov|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|post|pro|tel|travel|xxx|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|Ja|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za|zm|zw)\b/
?(?!@)))"""
|
||||
|
||||
HASHTAG_REGEX = r"(?:^|\s)[##]{1}(\w+)"
|
||||
|
||||
# regex patterns for finding crypto addresses
|
||||
BTC_REGEX = r"\b(bc(0([ac-hj-np-z02-9]{39}|[ac-hj-np-z02-9]{59})|1[ac-hj-np-z02-9]{8,87})|[13][a-km-zA-HJ-NP-Z1-9]{25,35})\b"
|
||||
ETHER_REGEX = r"(0x[a-fA-F0-9]{40})"
|
||||
|
||||
|
||||
@dataclass
|
||||
class ScraperResult:
|
||||
@@ -172,15 +180,6 @@ class ChannelInfo:
|
||||
pass
|
||||
|
||||
|
||||
nlp_en = spacy.load("en_core_web_sm", disable=["parser", "tok2vec", "attribute_ruler"])
|
||||
nlp_de = spacy.load("de_core_news_sm", disable=["parser", "tok2vec", "attribute_ruler"])
|
||||
nlp_it = spacy.load("it_core_news_sm", disable=["parser", "tok2vec", "attribute_ruler"])
|
||||
nlp_fr = spacy.load("fr_core_news_sm", disable=["parser", "tok2vec", "attribute_ruler"])
|
||||
nlp_ru = spacy.load("ru_core_news_sm", disable=["parser", "tok2vec", "attribute_ruler"])
|
||||
nlp_nl = spacy.load("nl_core_news_sm", disable=["parser", "tok2vec", "attribute_ruler"])
|
||||
nlp_xx = spacy.load("xx_ent_wiki_sm")
|
||||
|
||||
|
||||
@dataclass
|
||||
class Post:
|
||||
"""An object with fields for columns in the analysis table"""
|
||||
@@ -266,9 +265,7 @@ class Post:
|
||||
#: Video duration in seconds, if post is a video
|
||||
video_duration: int = None
|
||||
|
||||
def hydrate(self):
|
||||
URL_REGEX = r"""(?i)\b((?:https?:(?:/{1,3}|[a-z0-9%])|[a-z0-9.\-]+[.](?:com|net|org|edu|gov|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|post|pro|tel|travel|xxx|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|Ja|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za|zm|zw)/)(?:[^\s()<>{}\[\]]+|\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\))+(?:\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\)|[^\s`!()\[\]{};:\'\".,<>?«»“”‘’])|(?:(?<!@)[a-z0-9]+(?:[.\-][a-z0-9]+)*[.](?:com|net|org|edu|gov|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|post|pro|tel|travel|xxx|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|Ja|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za|zm|zw)\b/
?(?!@)))"""
|
||||
|
||||
def hydrate(self, nlp_models):
|
||||
# replace is here in order to prevent catastrophic backtracking
|
||||
urls = re.findall(
|
||||
URL_REGEX, self.content.replace("::::::::", "").replace("........", "")
|
||||
@@ -276,16 +273,10 @@ class Post:
|
||||
self.outlinks += urls
|
||||
self.outlinks = list(set(outlink for outlink in self.outlinks))
|
||||
|
||||
HASHTAG_REGEX = r"(?:^|\s)[##]{1}(\w+)"
|
||||
|
||||
hashtags = re.findall(HASHTAG_REGEX, self.content)
|
||||
self.hashtags += hashtags
|
||||
self.hashtags = list(set(hashtag.lower() for hashtag in self.hashtags))
|
||||
|
||||
# regex patterns for finding crypto addresses
|
||||
BTC_REGEX = r"\b(bc(0([ac-hj-np-z02-9]{39}|[ac-hj-np-z02-9]{59})|1[ac-hj-np-z02-9]{8,87})|[13][a-km-zA-HJ-NP-Z1-9]{25,35})\b"
|
||||
ETHER_REGEX = r"(0x[a-fA-F0-9]{40})"
|
||||
|
||||
self.cryptocurrency_addresses = [
|
||||
m[0] for m in re.findall(BTC_REGEX, self.content)
|
||||
] + re.findall(ETHER_REGEX, self.content)
|
||||
@@ -299,25 +290,13 @@ class Post:
|
||||
if self.detected_language == "af":
|
||||
self.detected_language = "nl"
|
||||
|
||||
self.hydrate_spacy()
|
||||
self.hydrate_spacy(nlp_models=nlp_models)
|
||||
|
||||
def hydrate_spacy(self):
|
||||
def hydrate_spacy(self, nlp_models):
|
||||
ner_only = False
|
||||
|
||||
if self.detected_language == "en":
|
||||
nlp = nlp_en
|
||||
elif self.detected_language == "de":
|
||||
nlp = nlp_de
|
||||
elif self.detected_language == "it":
|
||||
nlp = nlp_it
|
||||
elif self.detected_language == "fr":
|
||||
nlp = nlp_fr
|
||||
elif self.detected_language == "ru":
|
||||
nlp = nlp_ru
|
||||
elif self.detected_language == "nl":
|
||||
nlp = nlp_nl
|
||||
else:
|
||||
nlp = nlp_xx
|
||||
nlp = nlp_models.get(self.detected_language)
|
||||
if not nlp:
|
||||
nlp = nlp_models["xx"]
|
||||
ner_only = True
|
||||
|
||||
doc = nlp(self.content)
|
||||
|
||||
@@ -6,6 +6,7 @@ from sqlalchemy.engine.base import Engine
|
||||
from sqlalchemy.sql.expression import func
|
||||
from collections import defaultdict
|
||||
from datetime import datetime, timezone
|
||||
import spacy
|
||||
|
||||
from cisticola.base import (
|
||||
RawChannelInfo,
|
||||
@@ -117,6 +118,7 @@ class ETLController:
|
||||
|
||||
def __init__(self):
|
||||
self.transformers = []
|
||||
self.load_nlp()
|
||||
|
||||
def register_transformer(self, transformer: Transformer):
|
||||
"""Add a single Transformer instance to the list of available Transformers.
|
||||
@@ -168,6 +170,19 @@ class ETLController:
|
||||
# logger.info(f"Bulk saved {len(self.posts_to_insert)} posts")
|
||||
self.posts_to_insert = []
|
||||
|
||||
def load_nlp(self):
    """Load the spaCy pipelines and cache them on ``self.nlp_models``.

    ``self.nlp_models`` becomes a dict mapping a language code to its
    loaded spaCy pipeline. The ``"xx"`` entry holds the multi-language
    model. Every pipeline is loaded with the ``parser``, ``tok2vec`` and
    ``attribute_ruler`` components passed in ``disable``.
    """
    # Language code -> installed spaCy package name, in load order.
    model_names = {
        "en": "en_core_web_sm",
        "de": "de_core_news_sm",
        "it": "it_core_news_sm",
        "fr": "fr_core_news_sm",
        "ru": "ru_core_news_sm",
        "nl": "nl_core_news_sm",
        "xx": "xx_ent_wiki_sm",
    }
    disabled_components = ["parser", "tok2vec", "attribute_ruler"]

    self.nlp_models = {
        language: spacy.load(package, disable=disabled_components)
        for language, package in model_names.items()
    }
|
||||
|
||||
def insert_post(self, obj, session, hydrate: bool = True, flush: bool = False):
|
||||
"""Insert an object into the connected database.
|
||||
|
||||
@@ -187,7 +202,9 @@ class ETLController:
|
||||
-------
|
||||
None, or instance of ORM-mapped class from ``cisticola.base`` that has been inserted into the database, with additional data fields if ``flush`` argument is ``True``.
|
||||
"""
|
||||
if hydrate and type(obj) != Video:
|
||||
if hydrate and type(obj) == Post:
|
||||
obj.hydrate(nlp_models=self.nlp_models)
|
||||
elif hydrate and type(obj) != Video:
|
||||
obj.hydrate()
|
||||
|
||||
if flush:
|
||||
@@ -307,8 +324,10 @@ class ETLController:
|
||||
|
||||
return instance
|
||||
|
||||
# Don't hydrate videos, because they can be quite large and this is time consuming
|
||||
if hydrate and type(obj) != Video:
|
||||
# Don't hydrate videos, because they can be quite large and hydrating them is time consuming; Posts are hydrated with the preloaded spaCy models
|
||||
if hydrate and type(obj) == Post:
|
||||
obj.hydrate(nlp_models=self.nlp_models)
|
||||
elif hydrate and type(obj) != Video:
|
||||
obj.hydrate()
|
||||
|
||||
session.add(obj)
|
||||
|
||||
Reference in New Issue
Block a user