Mirror of https://github.com/bellingcat/cisticola.git (synced 2026-06-13 05:48:33 +03:00)
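Use smaller batches for now: reduce BATCH_SIZE from 50000 to 5000, and cast ScraperResult.archived_urls to String before comparing it with '{}'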
This commit is contained in:
@@ -1,5 +1,6 @@
|
|||||||
from typing import List, Generator, Union, Callable
|
from typing import List, Generator, Union, Callable
|
||||||
from loguru import logger
|
from loguru import logger
|
||||||
|
from sqlalchemy import cast, String
|
||||||
from sqlalchemy.orm import sessionmaker, make_transient
|
from sqlalchemy.orm import sessionmaker, make_transient
|
||||||
from sqlalchemy.engine.base import Engine
|
from sqlalchemy.engine.base import Engine
|
||||||
from sqlalchemy.sql.expression import func
|
from sqlalchemy.sql.expression import func
|
||||||
@@ -338,14 +339,14 @@ class ETLController:
|
|||||||
|
|
||||||
session = self.session()
|
session = self.session()
|
||||||
|
|
||||||
BATCH_SIZE = 50000
|
BATCH_SIZE = 5000
|
||||||
offset = 0
|
offset = 0
|
||||||
batch = []
|
batch = []
|
||||||
|
|
||||||
query = (session.query(ScraperResult, Post)
|
query = (session.query(ScraperResult, Post)
|
||||||
.join(Post)
|
.join(Post)
|
||||||
.join(Media, isouter=True)
|
.join(Media, isouter=True)
|
||||||
.filter((ScraperResult.media_archived != None) & (ScraperResult.archived_urls != '{}') & (Media.id == None))
|
.filter((ScraperResult.media_archived != None) & (cast(ScraperResult.archived_urls, String) != '{}') & (Media.id == None))
|
||||||
.order_by(ScraperResult.date.asc())
|
.order_by(ScraperResult.date.asc())
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user