added more docstrings and comments

This commit is contained in:
Tristan Lee
2022-03-14 19:38:33 -05:00
parent c3eab2f176
commit a3c859ec79
3 changed files with 94 additions and 65 deletions

View File

@@ -1,14 +1,15 @@
from typing import List
from dataclasses import dataclass
from datetime import datetime
import tempfile
import json
import io
from sqlalchemy.orm import registry
from sqlalchemy import Table, Column, Integer, String, JSON, DateTime, ForeignKey
import pytesseract
import PIL
import io
import exiftool
import json
import os
from .utils import make_request
@@ -123,6 +124,85 @@ class TransformedResult:
#: Text of the original post
content: str
@dataclass
class Media:
"""Base class for organizing information about a media file.
"""
#: ID number of the media's corresponding scraped post in the ``raw_data`` table.
raw_id: int
#: ID number of the media's corresponging scraped post in the ``analysis`` table.
post: int
#: URL of the original post.
url: str
#: Original URL of the media from the the original post.
original_url: str
#: JSON dump of the dict containing metadata information for the media file.
exif: str = None
def get_blob(self):
"""Download media file as bytes blob.
"""
blob = make_request(self.url)
return blob.content
def hydrate(self, blob = None):
"""Download media file as bytes blob and extract data from content.
"""
if blob is None:
blob = self.get_blob()
self.hydrate_exif(blob)
def hydrate_exif(self, blob):
"""Extract Exif metadata from bytes blob.
"""
with tempfile.NamedTemporaryFile() as temp_file:
temp_file.write(blob)
with exiftool.ExifTool() as et:
exif = et.get_metadata(temp_file.name)
self.exif = json.dumps(exif)
@dataclass
class Image(Media):
"""Class for organizing information about an image file.
"""
#: Extracted OCR content from image
ocr: str = None
def hydrate(self, blob=None):
"""Download image file as bytes blob and extract Exif and OCR content
from the image.
"""
if blob is None:
blob = self.get_blob()
super().hydrate(blob)
self.hydrate_ocr(blob)
def hydrate_ocr(self, blob):
"""Extract OCR (optical character recognition) data from image bytes blob.
"""
image = PIL.Image.open(io.BytesIO(blob))
self.ocr = pytesseract.image_to_string(image)
@dataclass
class Video(Media):
"""Class for organizing information about an image file.
"""
pass
mapper_registry = registry()
@@ -138,7 +218,6 @@ raw_data_table = Table('raw_data', mapper_registry.metadata,
Column('date_archived', DateTime),
Column('archived_urls', JSON))
mapper_registry.map_imperatively(ScraperResult, raw_data_table)
analysis_table = Table('analysis', mapper_registry.metadata,
Column('id', Integer, primary_key=True,
@@ -153,72 +232,21 @@ analysis_table = Table('analysis', mapper_registry.metadata,
Column('url', String),
Column('author_id', String),
Column('author_username', String),
Column('content', String)
)
mapper_registry.map_imperatively(TransformedResult, analysis_table)
@dataclass
class Media:
raw_id: int
post: int
url: str
original_url: str
exif: str = None
def get_blob(self):
blob = make_request(self.url)
return blob.content
def hydrate(self, blob = None):
if blob is None:
blob = self.get_blob()
self.hydrate_exif(blob)
def hydrate_exif(self, blob):
f = open('tmp', 'wb')
f.write(blob)
f.close()
with exiftool.ExifTool() as et:
exif = et.get_metadata('tmp')
self.exif = json.dumps(exif)
os.remove('tmp')
@dataclass
class Image(Media):
ocr: str = None
def hydrate(self, blob=None):
if blob is None:
blob = self.get_blob()
super().hydrate(blob)
self.hydrate_ocr(blob)
def hydrate_ocr(self, blob):
image = PIL.Image.open(io.BytesIO(blob))
self.ocr = pytesseract.image_to_string(image)
@dataclass
class Video(Media):
pass
Column('content', String))
media_table = Table('media', mapper_registry.metadata,
Column('id', Integer, primary_key=True,
autoincrement=True),
Column('type', String),
Column('type', String),
Column('raw_id', Integer, ForeignKey('raw_data.id')),
Column('post', Integer, ForeignKey('analysis.id')),
Column('url', String),
Column('original_url', String),
Column('exif', String),
Column('ocr', String)
)
Column('ocr', String))
mapper_registry.map_imperatively(TransformedResult, analysis_table)
mapper_registry.map_imperatively(ScraperResult, raw_data_table)
mapper_registry.map_imperatively(Media, media_table, polymorphic_on='type', polymorphic_identity='media')
mapper_registry.map_imperatively(Image, media_table, inherits=Media, polymorphic_on='type', polymorphic_identity='image')
mapper_registry.map_imperatively(Video, media_table, inherits=Media, polymorphic_on='type', polymorphic_identity='video')

View File

@@ -278,8 +278,8 @@ class ScraperController:
self.session.configure(bind=self.engine)
def reset_db(self):
"""Drop all data from the SQLAlchemy database.
"""
mapper_registry.metadata.drop_all(bind=self.engine)
self.connect_to_db(self.engine)
self.connect_to_db(self.engine)

View File

@@ -1,6 +1,6 @@
[pytest]
minversion =
6.0.2
7.0.0
testpaths =
tests/
python_files =
@@ -13,4 +13,5 @@ addopts =
--self-contained-html
filterwarnings =
ignore:the imp module is deprecated:DeprecationWarning
ignore:The localize method is no longer necessary, as this time zone supports the fold attribute
ignore:The localize method is no longer necessary, as this time zone supports the fold attribute
ignore:invalid escape sequence:DeprecationWarning