From a3c859ec79c9f31d5b67e2de4e98354f10eafe28 Mon Sep 17 00:00:00 2001
From: Tristan Lee <tristan@bellingcat.com>
Date: Mon, 14 Mar 2022 19:38:33 -0500
Subject: [PATCH] added more docstrings and comments

---
 cisticola/base.py         | 148 ++++++++++++++++++++++----------------
 cisticola/scraper/base.py |   6 +-
 pytest.ini                |   5 +-
 3 files changed, 94 insertions(+), 65 deletions(-)

diff --git a/cisticola/base.py b/cisticola/base.py
index fc07846..dfaaee8 100644
--- a/cisticola/base.py
+++ b/cisticola/base.py
@@ -1,14 +1,15 @@
 from typing import List
 from dataclasses import dataclass
 from datetime import datetime
+import tempfile 
+import json
+import io
+
 from sqlalchemy.orm import registry
 from sqlalchemy import Table, Column, Integer, String, JSON, DateTime, ForeignKey
 import pytesseract
 import PIL
-import io
 import exiftool
-import json
-import os
 
 from .utils import make_request
 
@@ -123,6 +124,85 @@ class TransformedResult:
     #: Text of the original post
     content: str
 
+@dataclass
+class Media:
+    """Base class for organizing information about a media file.
+    """
+
+    #: ID number of the media's corresponding scraped post in the ``raw_data`` table.
+    raw_id: int
+
+    #: ID number of the media's corresponding scraped post in the ``analysis`` table.
+    post: int
+
+    #: URL of the original post.
+    url: str
+
+    #: Original URL of the media from the original post.
+    original_url: str
+
+    #: JSON dump of the dict containing metadata information for the media file.
+    exif: str = None
+
+    def get_blob(self):
+        """Download media file as bytes blob.
+        """
+
+        blob = make_request(self.url)
+        return blob.content
+
+    def hydrate(self, blob = None):
+        """Download media file as bytes blob and extract data from content.
+        """
+
+        if blob is None:
+            blob = self.get_blob()
+
+        self.hydrate_exif(blob)
+
+    def hydrate_exif(self, blob):
+        """Extract Exif metadata from bytes blob.
+        """
+
+        with tempfile.NamedTemporaryFile() as temp_file:
+            temp_file.write(blob)
+
+            with exiftool.ExifTool() as et:
+                exif = et.get_metadata(temp_file.name)
+                self.exif = json.dumps(exif)
+
+@dataclass
+class Image(Media):
+    """Class for organizing information about an image file. 
+    """
+
+    #: Extracted OCR content from image
+    ocr: str = None
+
+    def hydrate(self, blob=None):
+        """Download image file as bytes blob and extract Exif and OCR content 
+        from the image.
+        """
+
+        if blob is None:
+            blob = self.get_blob()
+
+        super().hydrate(blob)
+        self.hydrate_ocr(blob)
+
+    def hydrate_ocr(self, blob):
+        """Extract OCR (optical character recognition) data from image bytes blob.
+        """
+
+        image = PIL.Image.open(io.BytesIO(blob))
+        self.ocr = pytesseract.image_to_string(image)
+
+@dataclass
+class Video(Media):
+    """Class for organizing information about a video file.
+    """
+    
+    pass
 
 mapper_registry = registry()
 
@@ -138,7 +218,6 @@ raw_data_table = Table('raw_data', mapper_registry.metadata,
                        Column('date_archived', DateTime),
                        Column('archived_urls', JSON))
 
-mapper_registry.map_imperatively(ScraperResult, raw_data_table)
 
 analysis_table = Table('analysis', mapper_registry.metadata,
                        Column('id', Integer, primary_key=True,
@@ -153,72 +232,21 @@ analysis_table = Table('analysis', mapper_registry.metadata,
                        Column('url', String),
                        Column('author_id', String),
                        Column('author_username', String),
-                       Column('content', String)
-                       )
-
-mapper_registry.map_imperatively(TransformedResult, analysis_table)
-
-@dataclass
-class Media:
-    raw_id: int
-    post: int
-    url: str
-    original_url: str
-
-    exif: str = None
-
-    def get_blob(self):
-        blob = make_request(self.url)
-        return blob.content
-
-    def hydrate(self, blob = None):
-        if blob is None:
-            blob = self.get_blob()
-
-        self.hydrate_exif(blob)
-
-    def hydrate_exif(self, blob):
-        f = open('tmp', 'wb')
-        f.write(blob)
-        f.close()
-
-        with exiftool.ExifTool() as et:
-            exif = et.get_metadata('tmp')
-            self.exif = json.dumps(exif)
-
-        os.remove('tmp')
-
-@dataclass
-class Image(Media):
-    ocr: str = None
-
-    def hydrate(self, blob=None):
-        if blob is None:
-            blob = self.get_blob()
-
-        super().hydrate(blob)
-        self.hydrate_ocr(blob)
-
-    def hydrate_ocr(self, blob):
-        image = PIL.Image.open(io.BytesIO(blob))
-        self.ocr = pytesseract.image_to_string(image)
-
-@dataclass
-class Video(Media):
-    pass
+                       Column('content', String))
 
 media_table = Table('media', mapper_registry.metadata,
                        Column('id', Integer, primary_key=True,
                               autoincrement=True),
-                        Column('type', String),
+                       Column('type', String),
                        Column('raw_id', Integer, ForeignKey('raw_data.id')),
                        Column('post', Integer, ForeignKey('analysis.id')),
                        Column('url', String),
                        Column('original_url', String),
                        Column('exif', String),
-                       Column('ocr', String)
-                       )
+                       Column('ocr', String))
 
+mapper_registry.map_imperatively(TransformedResult, analysis_table)
+mapper_registry.map_imperatively(ScraperResult, raw_data_table)
 mapper_registry.map_imperatively(Media, media_table, polymorphic_on='type', polymorphic_identity='media')
 mapper_registry.map_imperatively(Image, media_table, inherits=Media, polymorphic_on='type', polymorphic_identity='image')
 mapper_registry.map_imperatively(Video, media_table, inherits=Media, polymorphic_on='type', polymorphic_identity='video')
\ No newline at end of file
diff --git a/cisticola/scraper/base.py b/cisticola/scraper/base.py
index f35a13e..6f853c6 100644
--- a/cisticola/scraper/base.py
+++ b/cisticola/scraper/base.py
@@ -278,8 +278,8 @@ class ScraperController:
         self.session.configure(bind=self.engine)
 
     def reset_db(self):
+        """Drop all tables in the SQLAlchemy metadata, then reconnect to the database.
+        """
 
         mapper_registry.metadata.drop_all(bind=self.engine)
-        self.connect_to_db(self.engine)
-
-
+        self.connect_to_db(self.engine)
\ No newline at end of file
diff --git a/pytest.ini b/pytest.ini
index 09a94e1..f3545f6 100644
--- a/pytest.ini
+++ b/pytest.ini
@@ -1,6 +1,6 @@
 [pytest]
 minversion =
-  6.0.2
+  7.0.0
 testpaths =
   tests/
 python_files =
@@ -13,4 +13,5 @@ addopts =
   --self-contained-html
 filterwarnings =
     ignore:the imp module is deprecated:DeprecationWarning
-    ignore:The localize method is no longer necessary, as this time zone supports the fold attribute
\ No newline at end of file
+    ignore:The localize method is no longer necessary, as this time zone supports the fold attribute
+    ignore:invalid escape sequence:DeprecationWarning
\ No newline at end of file