mirror of
https://github.com/bellingcat/cisticola.git
synced 2026-06-12 13:28:34 +03:00
added more docstrings and comments
This commit is contained in:
@@ -1,14 +1,15 @@
|
|||||||
from typing import List
|
from typing import List
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
|
import tempfile
|
||||||
|
import json
|
||||||
|
import io
|
||||||
|
|
||||||
from sqlalchemy.orm import registry
|
from sqlalchemy.orm import registry
|
||||||
from sqlalchemy import Table, Column, Integer, String, JSON, DateTime, ForeignKey
|
from sqlalchemy import Table, Column, Integer, String, JSON, DateTime, ForeignKey
|
||||||
import pytesseract
|
import pytesseract
|
||||||
import PIL
|
import PIL
|
||||||
import io
|
|
||||||
import exiftool
|
import exiftool
|
||||||
import json
|
|
||||||
import os
|
|
||||||
|
|
||||||
from .utils import make_request
|
from .utils import make_request
|
||||||
|
|
||||||
@@ -123,6 +124,85 @@ class TransformedResult:
|
|||||||
#: Text of the original post
|
#: Text of the original post
|
||||||
content: str
|
content: str
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class Media:
|
||||||
|
"""Base class for organizing information about a media file.
|
||||||
|
"""
|
||||||
|
|
||||||
|
#: ID number of the media's corresponding scraped post in the ``raw_data`` table.
|
||||||
|
raw_id: int
|
||||||
|
|
||||||
|
#: ID number of the media's corresponding scraped post in the ``analysis`` table.
|
||||||
|
post: int
|
||||||
|
|
||||||
|
#: URL of the original post.
|
||||||
|
url: str
|
||||||
|
|
||||||
|
#: Original URL of the media from the original post.
|
||||||
|
original_url: str
|
||||||
|
|
||||||
|
#: JSON dump of the dict containing metadata information for the media file.
|
||||||
|
exif: str = None
|
||||||
|
|
||||||
|
def get_blob(self):
|
||||||
|
"""Download media file as bytes blob.
|
||||||
|
"""
|
||||||
|
|
||||||
|
blob = make_request(self.url)
|
||||||
|
return blob.content
|
||||||
|
|
||||||
|
def hydrate(self, blob = None):
|
||||||
|
"""Download media file as bytes blob and extract data from content.
|
||||||
|
"""
|
||||||
|
|
||||||
|
if blob is None:
|
||||||
|
blob = self.get_blob()
|
||||||
|
|
||||||
|
self.hydrate_exif(blob)
|
||||||
|
|
||||||
|
def hydrate_exif(self, blob):
|
||||||
|
"""Extract Exif metadata from bytes blob.
|
||||||
|
"""
|
||||||
|
|
||||||
|
with tempfile.NamedTemporaryFile() as temp_file:
|
||||||
|
temp_file.write(blob)
|
||||||
|
|
||||||
|
with exiftool.ExifTool() as et:
|
||||||
|
exif = et.get_metadata(temp_file.name)
|
||||||
|
self.exif = json.dumps(exif)
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class Image(Media):
|
||||||
|
"""Class for organizing information about an image file.
|
||||||
|
"""
|
||||||
|
|
||||||
|
#: Extracted OCR content from image
|
||||||
|
ocr: str = None
|
||||||
|
|
||||||
|
def hydrate(self, blob=None):
|
||||||
|
"""Download image file as bytes blob and extract Exif and OCR content
|
||||||
|
from the image.
|
||||||
|
"""
|
||||||
|
|
||||||
|
if blob is None:
|
||||||
|
blob = self.get_blob()
|
||||||
|
|
||||||
|
super().hydrate(blob)
|
||||||
|
self.hydrate_ocr(blob)
|
||||||
|
|
||||||
|
def hydrate_ocr(self, blob):
|
||||||
|
"""Extract OCR (optical character recognition) data from image bytes blob.
|
||||||
|
"""
|
||||||
|
|
||||||
|
image = PIL.Image.open(io.BytesIO(blob))
|
||||||
|
self.ocr = pytesseract.image_to_string(image)
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class Video(Media):
|
||||||
|
"""Class for organizing information about a video file.
|
||||||
|
"""
|
||||||
|
|
||||||
|
pass
|
||||||
|
|
||||||
mapper_registry = registry()
|
mapper_registry = registry()
|
||||||
|
|
||||||
@@ -138,7 +218,6 @@ raw_data_table = Table('raw_data', mapper_registry.metadata,
|
|||||||
Column('date_archived', DateTime),
|
Column('date_archived', DateTime),
|
||||||
Column('archived_urls', JSON))
|
Column('archived_urls', JSON))
|
||||||
|
|
||||||
mapper_registry.map_imperatively(ScraperResult, raw_data_table)
|
|
||||||
|
|
||||||
analysis_table = Table('analysis', mapper_registry.metadata,
|
analysis_table = Table('analysis', mapper_registry.metadata,
|
||||||
Column('id', Integer, primary_key=True,
|
Column('id', Integer, primary_key=True,
|
||||||
@@ -153,72 +232,21 @@ analysis_table = Table('analysis', mapper_registry.metadata,
|
|||||||
Column('url', String),
|
Column('url', String),
|
||||||
Column('author_id', String),
|
Column('author_id', String),
|
||||||
Column('author_username', String),
|
Column('author_username', String),
|
||||||
Column('content', String)
|
Column('content', String))
|
||||||
)
|
|
||||||
|
|
||||||
mapper_registry.map_imperatively(TransformedResult, analysis_table)
|
|
||||||
|
|
||||||
@dataclass
|
|
||||||
class Media:
|
|
||||||
raw_id: int
|
|
||||||
post: int
|
|
||||||
url: str
|
|
||||||
original_url: str
|
|
||||||
|
|
||||||
exif: str = None
|
|
||||||
|
|
||||||
def get_blob(self):
|
|
||||||
blob = make_request(self.url)
|
|
||||||
return blob.content
|
|
||||||
|
|
||||||
def hydrate(self, blob = None):
|
|
||||||
if blob is None:
|
|
||||||
blob = self.get_blob()
|
|
||||||
|
|
||||||
self.hydrate_exif(blob)
|
|
||||||
|
|
||||||
def hydrate_exif(self, blob):
|
|
||||||
f = open('tmp', 'wb')
|
|
||||||
f.write(blob)
|
|
||||||
f.close()
|
|
||||||
|
|
||||||
with exiftool.ExifTool() as et:
|
|
||||||
exif = et.get_metadata('tmp')
|
|
||||||
self.exif = json.dumps(exif)
|
|
||||||
|
|
||||||
os.remove('tmp')
|
|
||||||
|
|
||||||
@dataclass
|
|
||||||
class Image(Media):
|
|
||||||
ocr: str = None
|
|
||||||
|
|
||||||
def hydrate(self, blob=None):
|
|
||||||
if blob is None:
|
|
||||||
blob = self.get_blob()
|
|
||||||
|
|
||||||
super().hydrate(blob)
|
|
||||||
self.hydrate_ocr(blob)
|
|
||||||
|
|
||||||
def hydrate_ocr(self, blob):
|
|
||||||
image = PIL.Image.open(io.BytesIO(blob))
|
|
||||||
self.ocr = pytesseract.image_to_string(image)
|
|
||||||
|
|
||||||
@dataclass
|
|
||||||
class Video(Media):
|
|
||||||
pass
|
|
||||||
|
|
||||||
media_table = Table('media', mapper_registry.metadata,
|
media_table = Table('media', mapper_registry.metadata,
|
||||||
Column('id', Integer, primary_key=True,
|
Column('id', Integer, primary_key=True,
|
||||||
autoincrement=True),
|
autoincrement=True),
|
||||||
Column('type', String),
|
Column('type', String),
|
||||||
Column('raw_id', Integer, ForeignKey('raw_data.id')),
|
Column('raw_id', Integer, ForeignKey('raw_data.id')),
|
||||||
Column('post', Integer, ForeignKey('analysis.id')),
|
Column('post', Integer, ForeignKey('analysis.id')),
|
||||||
Column('url', String),
|
Column('url', String),
|
||||||
Column('original_url', String),
|
Column('original_url', String),
|
||||||
Column('exif', String),
|
Column('exif', String),
|
||||||
Column('ocr', String)
|
Column('ocr', String))
|
||||||
)
|
|
||||||
|
|
||||||
|
mapper_registry.map_imperatively(TransformedResult, analysis_table)
|
||||||
|
mapper_registry.map_imperatively(ScraperResult, raw_data_table)
|
||||||
mapper_registry.map_imperatively(Media, media_table, polymorphic_on='type', polymorphic_identity='media')
|
mapper_registry.map_imperatively(Media, media_table, polymorphic_on='type', polymorphic_identity='media')
|
||||||
mapper_registry.map_imperatively(Image, media_table, inherits=Media, polymorphic_on='type', polymorphic_identity='image')
|
mapper_registry.map_imperatively(Image, media_table, inherits=Media, polymorphic_on='type', polymorphic_identity='image')
|
||||||
mapper_registry.map_imperatively(Video, media_table, inherits=Media, polymorphic_on='type', polymorphic_identity='video')
|
mapper_registry.map_imperatively(Video, media_table, inherits=Media, polymorphic_on='type', polymorphic_identity='video')
|
||||||
@@ -278,8 +278,8 @@ class ScraperController:
|
|||||||
self.session.configure(bind=self.engine)
|
self.session.configure(bind=self.engine)
|
||||||
|
|
||||||
def reset_db(self):
|
def reset_db(self):
|
||||||
|
"""Drop all data from the SQLAlchemy database.
|
||||||
|
"""
|
||||||
|
|
||||||
mapper_registry.metadata.drop_all(bind=self.engine)
|
mapper_registry.metadata.drop_all(bind=self.engine)
|
||||||
self.connect_to_db(self.engine)
|
self.connect_to_db(self.engine)
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
[pytest]
|
[pytest]
|
||||||
minversion =
|
minversion =
|
||||||
6.0.2
|
7.0.0
|
||||||
testpaths =
|
testpaths =
|
||||||
tests/
|
tests/
|
||||||
python_files =
|
python_files =
|
||||||
@@ -14,3 +14,4 @@ addopts =
|
|||||||
filterwarnings =
|
filterwarnings =
|
||||||
ignore:the imp module is deprecated:DeprecationWarning
|
ignore:the imp module is deprecated:DeprecationWarning
|
||||||
ignore:The localize method is no longer necessary, as this time zone supports the fold attribute
|
ignore:The localize method is no longer necessary, as this time zone supports the fold attribute
|
||||||
|
ignore:invalid escape sequence:DeprecationWarning
|
||||||
Reference in New Issue
Block a user