diff --git a/src/Pipfile b/src/Pipfile index a8a1936..811903f 100644 --- a/src/Pipfile +++ b/src/Pipfile @@ -21,6 +21,7 @@ fastapi-utils = "*" prometheus-fastapi-instrumentator = "*" auto-archiver = "*" pydantic-settings = "*" +fastapi-mail = "*" [dev-packages] watchdog = "*" diff --git a/src/Pipfile.lock b/src/Pipfile.lock index 7ba5bf6..0d430ab 100644 --- a/src/Pipfile.lock +++ b/src/Pipfile.lock @@ -1,7 +1,7 @@ { "_meta": { "hash": { - "sha256": "601b24d82095b8b58f1376755d9a1c5c3bbf144aa622e1eef9fc034835f097fe" + "sha256": "14a7ead66a74419eebfa72478bbb7e3efe378df9f41e401738faa2871f5c4344" }, "pipfile-spec": 6, "requires": { @@ -122,6 +122,14 @@ "markers": "python_version >= '3.9'", "version": "==1.3.2" }, + "aiosmtplib": { + "hashes": [ + "sha256:08fd840f9dbc23258025dca229e8a8f04d2ccf3ecb1319585615bfc7933f7f47", + "sha256:8783059603a34834c7c90ca51103c3aa129d5922003b5ce98dbaa6d4440f10fc" + ], + "markers": "python_version >= '3.8'", + "version": "==3.0.2" + }, "aiosqlite": { "hashes": [ "sha256:36a1deaca0cac40ebe32aac9977a6e2bbc7f5189f23f4a54d5908986729e5bd6", @@ -674,6 +682,22 @@ "markers": "python_version >= '3.7'", "version": "==1.2.0" }, + "dnspython": { + "hashes": [ + "sha256:b4c34b7d10b51bcc3a5071e7b8dee77939f1e878477eeecc965e9835f63c6c86", + "sha256:ce9c432eda0dc91cf618a5cedf1a4e142651196bbcd2c80e89ed5a907e5cfaf1" + ], + "markers": "python_version >= '3.9'", + "version": "==2.7.0" + }, + "email-validator": { + "hashes": [ + "sha256:561977c2d73ce3611850a06fa56b414621e0c8faa9d66f2611407d87465da631", + "sha256:cb690f344c617a714f22e66ae771445a1ceb46821152df8e165c5f9a364582b7" + ], + "markers": "python_version >= '3.8'", + "version": "==2.2.0" + }, "exceptiongroup": { "hashes": [ "sha256:3111b9d131c238bec2f8f516e123e14ba243563fb135d3fe885990585aa7795b", @@ -691,6 +715,15 @@ "markers": "python_version >= '3.8'", "version": "==0.115.6" }, + "fastapi-mail": { + "hashes": [ + "sha256:04bde1005c624f42dfc0a9c1e313fcc544499fdd6b3531e606c500d80ac2ffcb", + "sha256:3525cf342ff91f6bcb3298570d1783498082e586957f668ee4164a0aab6ec743" + ], + "index": "pypi", + "markers": "python_full_version >= '3.8.1' and python_version < '4.0'", + "version": "==1.4.2" + }, "fastapi-utils": { "hashes": [ "sha256:6c4d507a76bab9a016cee0c4fa3a4638c636b2b2689e39c62254b1b2e4e81825", @@ -1867,11 +1900,11 @@ }, "pydantic": { "hashes": [ - "sha256:278b38dbbaec562011d659ee05f63346951b3a248a6f3642e1bc68894ea2b4ff", - "sha256:4dd4e322dbe55472cb7ca7e73f4b63574eecccf2835ffa2af9021ce113c83c53" + "sha256:427d664bf0b8a2b34ff5dd0f5a18df00591adcee7198fbd71981054cef37b584", + "sha256:ca5daa827cce33de7a42be142548b0096bf05a7e7b365aebfa5f8eeec7128236" ], "markers": "python_version >= '3.8'", - "version": "==2.10.5" + "version": "==2.10.6" }, "pydantic-core": { "hashes": [ @@ -2409,11 +2442,11 @@ }, "starlette": { "hashes": [ - "sha256:0e4ab3d16522a255be6b28260b938eae2482f98ce5cc934cb08dce8dc3ba5835", - "sha256:44cedb2b7c77a9de33a8b74b2b90e9f50d11fcf25d8270ea525ad71a25374ff7" + "sha256:2cbcba2a75806f8a41c722141486f37c28e30a0921c5f6fe4346cb0dcee1302f", + "sha256:dfb6d332576f136ec740296c7e8bb8c8a7125044e7c6da30744718880cdd059d" ], - "markers": "python_version >= '3.8'", - "version": "==0.41.3" + "markers": "python_version >= '3.9'", + "version": "==0.45.3" }, "telethon": { "hashes": [ diff --git a/src/core/events.py b/src/core/events.py index 03865d9..d423b16 100644 --- a/src/core/events.py +++ b/src/core/events.py @@ -12,6 +12,7 @@ from db.database import get_db, get_db_async, make_engine from shared.settings import get_settings from utils.metrics import measure_regular_metrics, redis_subscribe_worker_exceptions from worker.main import create_sheet_task +from fastapi_mail import FastMail, MessageSchema, MessageType @asynccontextmanager @@ -35,6 +36,11 @@ async def lifespan(app: FastAPI): else: logger.warning("[CRON] Sheet archive cronjobs are disabled.") + if get_settings().CRON_DELETE_STALE_SHEETS: + asyncio.create_task(delete_stale_sheets()) + else: + logger.warning("[CRON] Delete stale sheets cronjob is disabled.") + yield # separates startup from shutdown instructions # SHUTDOWN @@ -66,3 +72,39 @@ async def archive_sheets_cronjob(frequency: str, interval: int, current_time_uni task = create_sheet_task.apply_async(args=[schemas.SubmitSheet(sheet_id=s.id, author_id=s.author_id, group=s.group_id).model_dump_json()]) triggered_jobs.append({"sheet_id": s.id, "task_id": task.id}) logger.info(f"[CRON {frequency.upper()}:{current_time_unit}] Triggered {len(triggered_jobs)} sheet tasks: {triggered_jobs}") + + +@repeat_every(seconds=86400, wait_first=150, on_exception=logger.error) +async def delete_stale_sheets(): + STALE_DAYS = get_settings().DELETE_STALE_SHEETS_DAYS + logger.info(f"[CRON] Deleting stale sheets older than {STALE_DAYS} days.") + async with get_db_async() as db: + user_sheets = await crud.delete_stale_sheets(db, STALE_DAYS) + + if not user_sheets: return + + fastmail = FastMail(get_settings().MAIL_CONFIG) + # notify users + for email in user_sheets: + list_of_sheets = "\n".join([f'
  • {s.name}
  • ' for s in user_sheets[email]]) + message = MessageSchema( + subject="Auto Archiver: Stale Sheets Removed", + recipients=[email], + body=f""" + + +

    Hi {email},

    +

    Your stale sheets have been removed from our system as no new URL was archived in the past {STALE_DAYS} days:

    + +

    You can always re-add them at https://auto-archiver.bellingcat.com/.

    +

    Best,
    The Auto Archiver team

    + + + """, + subtype=MessageType.html + ) + await fastmail.send_message(message) + logger.info(f"[CRON] Email sent to {email} about stale sheets deletion.") + diff --git a/src/db/crud.py b/src/db/crud.py index 069277d..8e3fb1d 100644 --- a/src/db/crud.py +++ b/src/db/crud.py @@ -261,6 +261,18 @@ async def get_sheets_by_id_hash(db: AsyncSession, frequency: str, modulo: str, i filtered.append(sheet) return filtered +async def delete_stale_sheets(db: AsyncSession, inactivity_days: int) -> dict: + time_threshold = datetime.now() - timedelta(days=inactivity_days) + result = await db.execute( + select(models.Sheet).filter(models.Sheet.last_url_archived_at < time_threshold) + ) + deleted = defaultdict(list) + for sheet in result.scalars(): + await db.delete(sheet) + deleted[sheet.author_id].append(sheet) + await db.commit() + return dict(deleted) + def update_sheet_last_url_archived_at(db: Session, sheet_id: str): db_sheet = db.query(models.Sheet).filter(models.Sheet.id == sheet_id).first() if db_sheet: diff --git a/src/db/database.py b/src/db/database.py index e72dc15..7e4046b 100644 --- a/src/db/database.py +++ b/src/db/database.py @@ -36,16 +36,16 @@ def get_db_dependency(): with get_db() as db: yield db + + # ASYNC connections - - async def make_async_engine(database_url: str) -> AsyncEngine: engine = create_async_engine(database_url, connect_args={"check_same_thread": False}) return engine async def make_async_session_local(engine: AsyncEngine) -> AsyncSession: - return async_sessionmaker(engine, expire_on_commit=False) + return async_sessionmaker(engine, expire_on_commit=False, autoflush=False, autocommit=False) @asynccontextmanager diff --git a/src/shared/settings.py b/src/shared/settings.py index 0a2aec5..5d4b843 100644 --- a/src/shared/settings.py +++ b/src/shared/settings.py @@ -1,5 +1,6 @@ from functools import lru_cache +from fastapi_mail import ConnectionConfig from pydantic_settings import BaseSettings from pydantic import ConfigDict from typing import Annotated, Set @@ -16,6 +17,8 @@ class Settings(BaseSettings): # cronjobs CRON_ARCHIVE_SHEETS: bool = False + CRON_DELETE_STALE_SHEETS: bool = True + DELETE_STALE_SHEETS_DAYS: int = 14 # database DATABASE_PATH: str @@ -39,6 +42,29 @@ class Settings(BaseSettings): #TODO: deprecate blocklist? BLOCKED_EMAILS: Annotated[Set[str], Len(min_length=0)] = set() + # email configuration, if needed + MAIL_FROM: str = "noreply@bellingcat.com" + MAIL_FROM_NAME: str = "Bellingcat's Auto Archiver" + MAIL_USERNAME: str = "" + MAIL_PASSWORD: str = "" + MAIL_SERVER: str = "" + MAIL_PORT: int = 587 + MAIL_STARTTLS: bool = False + MAIL_SSL_TLS: bool = True + @property + def MAIL_CONFIG(self) -> str: + return ConnectionConfig( + MAIL_FROM=self.MAIL_FROM, + MAIL_FROM_NAME=self.MAIL_FROM_NAME, + MAIL_USERNAME=self.MAIL_USERNAME, + MAIL_PASSWORD=self.MAIL_PASSWORD, + MAIL_SERVER=self.MAIL_SERVER, + MAIL_PORT=self.MAIL_PORT, + MAIL_STARTTLS=self.MAIL_STARTTLS, + MAIL_SSL_TLS=self.MAIL_SSL_TLS, + ) + + @lru_cache def get_settings(): return Settings() \ No newline at end of file