auto remove stale sheets and email users

This commit is contained in:
msramalho
2025-02-07 17:57:02 +00:00
parent eccd71d168
commit 83ba9884f6
6 changed files with 125 additions and 11 deletions

View File

@@ -21,6 +21,7 @@ fastapi-utils = "*"
prometheus-fastapi-instrumentator = "*"
auto-archiver = "*"
pydantic-settings = "*"
fastapi-mail = "*"
[dev-packages]
watchdog = "*"

49
src/Pipfile.lock generated
View File

@@ -1,7 +1,7 @@
{
"_meta": {
"hash": {
"sha256": "601b24d82095b8b58f1376755d9a1c5c3bbf144aa622e1eef9fc034835f097fe"
"sha256": "14a7ead66a74419eebfa72478bbb7e3efe378df9f41e401738faa2871f5c4344"
},
"pipfile-spec": 6,
"requires": {
@@ -122,6 +122,14 @@
"markers": "python_version >= '3.9'",
"version": "==1.3.2"
},
"aiosmtplib": {
"hashes": [
"sha256:08fd840f9dbc23258025dca229e8a8f04d2ccf3ecb1319585615bfc7933f7f47",
"sha256:8783059603a34834c7c90ca51103c3aa129d5922003b5ce98dbaa6d4440f10fc"
],
"markers": "python_version >= '3.8'",
"version": "==3.0.2"
},
"aiosqlite": {
"hashes": [
"sha256:36a1deaca0cac40ebe32aac9977a6e2bbc7f5189f23f4a54d5908986729e5bd6",
@@ -674,6 +682,22 @@
"markers": "python_version >= '3.7'",
"version": "==1.2.0"
},
"dnspython": {
"hashes": [
"sha256:b4c34b7d10b51bcc3a5071e7b8dee77939f1e878477eeecc965e9835f63c6c86",
"sha256:ce9c432eda0dc91cf618a5cedf1a4e142651196bbcd2c80e89ed5a907e5cfaf1"
],
"markers": "python_version >= '3.9'",
"version": "==2.7.0"
},
"email-validator": {
"hashes": [
"sha256:561977c2d73ce3611850a06fa56b414621e0c8faa9d66f2611407d87465da631",
"sha256:cb690f344c617a714f22e66ae771445a1ceb46821152df8e165c5f9a364582b7"
],
"markers": "python_version >= '3.8'",
"version": "==2.2.0"
},
"exceptiongroup": {
"hashes": [
"sha256:3111b9d131c238bec2f8f516e123e14ba243563fb135d3fe885990585aa7795b",
@@ -691,6 +715,15 @@
"markers": "python_version >= '3.8'",
"version": "==0.115.6"
},
"fastapi-mail": {
"hashes": [
"sha256:04bde1005c624f42dfc0a9c1e313fcc544499fdd6b3531e606c500d80ac2ffcb",
"sha256:3525cf342ff91f6bcb3298570d1783498082e586957f668ee4164a0aab6ec743"
],
"index": "pypi",
"markers": "python_full_version >= '3.8.1' and python_version < '4.0'",
"version": "==1.4.2"
},
"fastapi-utils": {
"hashes": [
"sha256:6c4d507a76bab9a016cee0c4fa3a4638c636b2b2689e39c62254b1b2e4e81825",
@@ -1867,11 +1900,11 @@
},
"pydantic": {
"hashes": [
"sha256:278b38dbbaec562011d659ee05f63346951b3a248a6f3642e1bc68894ea2b4ff",
"sha256:4dd4e322dbe55472cb7ca7e73f4b63574eecccf2835ffa2af9021ce113c83c53"
"sha256:427d664bf0b8a2b34ff5dd0f5a18df00591adcee7198fbd71981054cef37b584",
"sha256:ca5daa827cce33de7a42be142548b0096bf05a7e7b365aebfa5f8eeec7128236"
],
"markers": "python_version >= '3.8'",
"version": "==2.10.5"
"version": "==2.10.6"
},
"pydantic-core": {
"hashes": [
@@ -2409,11 +2442,11 @@
},
"starlette": {
"hashes": [
"sha256:0e4ab3d16522a255be6b28260b938eae2482f98ce5cc934cb08dce8dc3ba5835",
"sha256:44cedb2b7c77a9de33a8b74b2b90e9f50d11fcf25d8270ea525ad71a25374ff7"
"sha256:2cbcba2a75806f8a41c722141486f37c28e30a0921c5f6fe4346cb0dcee1302f",
"sha256:dfb6d332576f136ec740296c7e8bb8c8a7125044e7c6da30744718880cdd059d"
],
"markers": "python_version >= '3.8'",
"version": "==0.41.3"
"markers": "python_version >= '3.9'",
"version": "==0.45.3"
},
"telethon": {
"hashes": [

View File

@@ -12,6 +12,7 @@ from db.database import get_db, get_db_async, make_engine
from shared.settings import get_settings
from utils.metrics import measure_regular_metrics, redis_subscribe_worker_exceptions
from worker.main import create_sheet_task
from fastapi_mail import FastMail, MessageSchema, MessageType
@asynccontextmanager
@@ -35,6 +36,11 @@ async def lifespan(app: FastAPI):
else:
logger.warning("[CRON] Sheet archive cronjobs are disabled.")
if get_settings().CRON_DELETE_STALE_SHEETS:
asyncio.create_task(delete_stale_sheets())
else:
logger.warning("[CRON] Delete stale sheets cronjob is disabled.")
yield # separates startup from shutdown instructions
# SHUTDOWN
@@ -66,3 +72,39 @@ async def archive_sheets_cronjob(frequency: str, interval: int, current_time_uni
task = create_sheet_task.apply_async(args=[schemas.SubmitSheet(sheet_id=s.id, author_id=s.author_id, group=s.group_id).model_dump_json()])
triggered_jobs.append({"sheet_id": s.id, "task_id": task.id})
logger.info(f"[CRON {frequency.upper()}:{current_time_unit}] Triggered {len(triggered_jobs)} sheet tasks: {triggered_jobs}")
@repeat_every(seconds=86400, wait_first=150, on_exception=logger.error)
async def delete_stale_sheets():
STALE_DAYS = get_settings().DELETE_STALE_SHEETS_DAYS
logger.info(f"[CRON] Deleting stale sheets older than {STALE_DAYS} days.")
async with get_db_async() as db:
user_sheets = await crud.delete_stale_sheets(db, STALE_DAYS)
if not user_sheets: return
fastmail = FastMail(get_settings().MAIL_CONFIG)
# notify users
for email in user_sheets:
list_of_sheets = "\n".join([f'<li><a href="https://docs.google.com/spreadsheets/d/{s.id}">{s.name}</a></li>' for s in user_sheets[email]])
message = MessageSchema(
subject="Auto Archiver: Stale Sheets Removed",
recipients=[email],
body=f"""
<html>
<body>
<p>Hi {email},</p>
<p>Your stale sheets have been removed from our system as no new URL was archived in the past {STALE_DAYS} days:</p>
<ul>
{list_of_sheets}
</ul>
<p>You can always re-add them at https://auto-archiver.bellingcat.com/.</p>
<p>Best,<br>The Auto Archiver team</p>
</body>
</html>
""",
subtype=MessageType.html
)
await fastmail.send_message(message)
logger.info(f"[CRON] Email sent to {email} about stale sheets deletion.")

View File

@@ -261,6 +261,18 @@ async def get_sheets_by_id_hash(db: AsyncSession, frequency: str, modulo: str, i
filtered.append(sheet)
return filtered
async def delete_stale_sheets(db: AsyncSession, inactivity_days: int) -> dict:
time_threshold = datetime.now() - timedelta(days=inactivity_days)
result = await db.execute(
select(models.Sheet).filter(models.Sheet.last_url_archived_at < time_threshold)
)
deleted = defaultdict(list)
for sheet in result.scalars():
await db.delete(sheet)
deleted[sheet.author_id].append(sheet)
await db.commit()
return dict(deleted)
def update_sheet_last_url_archived_at(db: Session, sheet_id: str):
db_sheet = db.query(models.Sheet).filter(models.Sheet.id == sheet_id).first()
if db_sheet:

View File

@@ -36,16 +36,16 @@ def get_db_dependency():
with get_db() as db:
yield db
# ASYNC connections
async def make_async_engine(database_url: str) -> AsyncEngine:
engine = create_async_engine(database_url, connect_args={"check_same_thread": False})
return engine
async def make_async_session_local(engine: AsyncEngine) -> AsyncSession:
return async_sessionmaker(engine, expire_on_commit=False)
return async_sessionmaker(engine, expire_on_commit=False, autoflush=False, autocommit=False)
@asynccontextmanager

View File

@@ -1,5 +1,6 @@
from functools import lru_cache
from fastapi_mail import ConnectionConfig
from pydantic_settings import BaseSettings
from pydantic import ConfigDict
from typing import Annotated, Set
@@ -16,6 +17,8 @@ class Settings(BaseSettings):
# cronjobs
CRON_ARCHIVE_SHEETS: bool = False
CRON_DELETE_STALE_SHEETS: bool = True
DELETE_STALE_SHEETS_DAYS: int = 14
# database
DATABASE_PATH: str
@@ -39,6 +42,29 @@ class Settings(BaseSettings):
#TODO: deprecate blocklist?
BLOCKED_EMAILS: Annotated[Set[str], Len(min_length=0)] = set()
# email configuration, if needed
MAIL_FROM: str = "noreply@bellingcat.com"
MAIL_FROM_NAME: str = "Bellingcat's Auto Archiver"
MAIL_USERNAME: str = ""
MAIL_PASSWORD: str = ""
MAIL_SERVER: str = ""
MAIL_PORT: int = 587
MAIL_STARTTLS: bool = False
MAIL_SSL_TLS: bool = True
@property
def MAIL_CONFIG(self) -> str:
return ConnectionConfig(
MAIL_FROM=self.MAIL_FROM,
MAIL_FROM_NAME=self.MAIL_FROM_NAME,
MAIL_USERNAME=self.MAIL_USERNAME,
MAIL_PASSWORD=self.MAIL_PASSWORD,
MAIL_SERVER=self.MAIL_SERVER,
MAIL_PORT=self.MAIL_PORT,
MAIL_STARTTLS=self.MAIL_STARTTLS,
MAIL_SSL_TLS=self.MAIL_SSL_TLS,
)
@lru_cache
def get_settings():
return Settings()