Use smaller batches for now

This commit is contained in:
Logan Williams
2022-07-05 09:48:57 +00:00
parent 6149c4279d
commit 51e5ca1f04

View File

@@ -1,5 +1,6 @@
from typing import List, Generator, Union, Callable from typing import List, Generator, Union, Callable
from loguru import logger from loguru import logger
from sqlalchemy import cast, String
from sqlalchemy.orm import sessionmaker, make_transient from sqlalchemy.orm import sessionmaker, make_transient
from sqlalchemy.engine.base import Engine from sqlalchemy.engine.base import Engine
from sqlalchemy.sql.expression import func from sqlalchemy.sql.expression import func
@@ -338,14 +339,14 @@ class ETLController:
session = self.session() session = self.session()
BATCH_SIZE = 50000 BATCH_SIZE = 5000
offset = 0 offset = 0
batch = [] batch = []
query = (session.query(ScraperResult, Post) query = (session.query(ScraperResult, Post)
.join(Post) .join(Post)
.join(Media, isouter=True) .join(Media, isouter=True)
.filter((ScraperResult.media_archived != None) & (ScraperResult.archived_urls != '{}') & (Media.id == None)) .filter((ScraperResult.media_archived != None) & (cast(ScraperResult.archived_urls, String) != '{}') & (Media.id == None))
.order_by(ScraperResult.date.asc()) .order_by(ScraperResult.date.asc())
) )