From 51e5ca1f04df4f8edc3e418e2092d261f03e8b1d Mon Sep 17 00:00:00 2001 From: Logan Williams Date: Tue, 5 Jul 2022 09:48:57 +0000 Subject: [PATCH] Use smaller batches for now --- cisticola/transformer/base.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/cisticola/transformer/base.py b/cisticola/transformer/base.py index 8d45a2b..be2a5bc 100644 --- a/cisticola/transformer/base.py +++ b/cisticola/transformer/base.py @@ -1,5 +1,6 @@ from typing import List, Generator, Union, Callable from loguru import logger +from sqlalchemy import cast, String from sqlalchemy.orm import sessionmaker, make_transient from sqlalchemy.engine.base import Engine from sqlalchemy.sql.expression import func @@ -338,14 +339,14 @@ class ETLController: session = self.session() - BATCH_SIZE = 50000 + BATCH_SIZE = 5000 offset = 0 batch = [] query = (session.query(ScraperResult, Post) .join(Post) .join(Media, isouter=True) - .filter((ScraperResult.media_archived != None) & (ScraperResult.archived_urls != '{}') & (Media.id == None)) + .filter((ScraperResult.media_archived != None) & (cast(ScraperResult.archived_urls, String) != '{}') & (Media.id == None)) .order_by(ScraperResult.date.asc()) )