Add new function for insert post (faster/bulk)

This commit is contained in:
Logan Williams
2023-05-04 14:04:55 +02:00
parent 2320ea1efd
commit ebbc6b69dd
11 changed files with 209 additions and 150 deletions

1
.gitignore vendored
View File

@@ -8,6 +8,7 @@ docs/source/_*
*.ipynb
*.db
.env
.env*
*.session
*.session-journal
service_account.json

42
Dockerfile Normal file
View File

@@ -0,0 +1,42 @@
FROM python:3.9-slim as base
# Setup env
ENV LANG C.UTF-8
ENV LC_ALL C.UTF-8
ENV PYTHONDONTWRITEBYTECODE 1
ENV PYTHONFAULTHANDLER 1
FROM base AS python-deps
# Install pipenv and compilation dependencies
RUN pip install pipenv
RUN apt-get update && apt-get install -y --no-install-recommends gcc g++ git libpq-dev musl-dev
# Install python dependencies in /.venv
COPY Pipfile .
COPY Pipfile.lock .
RUN PIPENV_VENV_IN_PROJECT=1 pipenv install --deploy
FROM base AS runtime
RUN apt-get update && apt-get install -y --no-install-recommends cron libpq-dev
# Copy virtual env from python-deps stage
COPY --from=python-deps /.venv /.venv
ENV PATH="/.venv/bin:$PATH"
# Create and switch to a new user
# RUN useradd --create-home appuser
WORKDIR /root
# USER appuser
# Install application into container
COPY . .
RUN ./spacy_setup.sh
# Copy crontab and start cron in foreground mode
COPY crontab /etc/crontabs/root
CMD ["cron", "-f"]

View File

@@ -28,6 +28,7 @@ ocrd-pyexiftool = "*"
filelock = "*"
telethon = "*"
psycopg2 = "*"
joblib = "*"
[dev-packages]
pytest = "*"

226
Pipfile.lock generated
View File

@@ -1,7 +1,7 @@
{
"_meta": {
"hash": {
"sha256": "2eb38f729271771b6fd5c72729a49bd99ee5f90edaf16aceb5be0131990a1805"
"sha256": "4a8b1578e90c9c6f6cac1ec74dc04df51396fb63c7e3a2f693d5f288c2886cb9"
},
"pipfile-spec": 6,
"requires": {
@@ -92,96 +92,84 @@
},
"charset-normalizer": {
"hashes": [
"sha256:00d3ffdaafe92a5dc603cb9bd5111aaa36dfa187c8285c543be562e61b755f6b",
"sha256:024e606be3ed92216e2b6952ed859d86b4cfa52cd5bc5f050e7dc28f9b43ec42",
"sha256:0298eafff88c99982a4cf66ba2efa1128e4ddaca0b05eec4c456bbc7db691d8d",
"sha256:02a51034802cbf38db3f89c66fb5d2ec57e6fe7ef2f4a44d070a593c3688667b",
"sha256:083c8d17153ecb403e5e1eb76a7ef4babfc2c48d58899c98fcaa04833e7a2f9a",
"sha256:0a11e971ed097d24c534c037d298ad32c6ce81a45736d31e0ff0ad37ab437d59",
"sha256:0bf2dae5291758b6f84cf923bfaa285632816007db0330002fa1de38bfcb7154",
"sha256:0c0a590235ccd933d9892c627dec5bc7511ce6ad6c1011fdf5b11363022746c1",
"sha256:0f438ae3532723fb6ead77e7c604be7c8374094ef4ee2c5e03a3a17f1fca256c",
"sha256:109487860ef6a328f3eec66f2bf78b0b72400280d8f8ea05f69c51644ba6521a",
"sha256:11b53acf2411c3b09e6af37e4b9005cba376c872503c8f28218c7243582df45d",
"sha256:12db3b2c533c23ab812c2b25934f60383361f8a376ae272665f8e48b88e8e1c6",
"sha256:14e76c0f23218b8f46c4d87018ca2e441535aed3632ca134b10239dfb6dadd6b",
"sha256:16a8663d6e281208d78806dbe14ee9903715361cf81f6d4309944e4d1e59ac5b",
"sha256:292d5e8ba896bbfd6334b096e34bffb56161c81408d6d036a7dfa6929cff8783",
"sha256:2c03cc56021a4bd59be889c2b9257dae13bf55041a3372d3295416f86b295fb5",
"sha256:2e396d70bc4ef5325b72b593a72c8979999aa52fb8bcf03f701c1b03e1166918",
"sha256:2edb64ee7bf1ed524a1da60cdcd2e1f6e2b4f66ef7c077680739f1641f62f555",
"sha256:31a9ddf4718d10ae04d9b18801bd776693487cbb57d74cc3458a7673f6f34639",
"sha256:356541bf4381fa35856dafa6a965916e54bed415ad8a24ee6de6e37deccf2786",
"sha256:358a7c4cb8ba9b46c453b1dd8d9e431452d5249072e4f56cfda3149f6ab1405e",
"sha256:37f8febc8ec50c14f3ec9637505f28e58d4f66752207ea177c1d67df25da5aed",
"sha256:39049da0ffb96c8cbb65cbf5c5f3ca3168990adf3551bd1dee10c48fce8ae820",
"sha256:39cf9ed17fe3b1bc81f33c9ceb6ce67683ee7526e65fde1447c772afc54a1bb8",
"sha256:3ae1de54a77dc0d6d5fcf623290af4266412a7c4be0b1ff7444394f03f5c54e3",
"sha256:3b590df687e3c5ee0deef9fc8c547d81986d9a1b56073d82de008744452d6541",
"sha256:3e45867f1f2ab0711d60c6c71746ac53537f1684baa699f4f668d4c6f6ce8e14",
"sha256:3fc1c4a2ffd64890aebdb3f97e1278b0cc72579a08ca4de8cd2c04799a3a22be",
"sha256:4457ea6774b5611f4bed5eaa5df55f70abde42364d498c5134b7ef4c6958e20e",
"sha256:44ba614de5361b3e5278e1241fda3dc1838deed864b50a10d7ce92983797fa76",
"sha256:4a8fcf28c05c1f6d7e177a9a46a1c52798bfe2ad80681d275b10dcf317deaf0b",
"sha256:4b0d02d7102dd0f997580b51edc4cebcf2ab6397a7edf89f1c73b586c614272c",
"sha256:502218f52498a36d6bf5ea77081844017bf7982cdbe521ad85e64cabee1b608b",
"sha256:503e65837c71b875ecdd733877d852adbc465bd82c768a067badd953bf1bc5a3",
"sha256:5995f0164fa7df59db4746112fec3f49c461dd6b31b841873443bdb077c13cfc",
"sha256:59e5686dd847347e55dffcc191a96622f016bc0ad89105e24c14e0d6305acbc6",
"sha256:601f36512f9e28f029d9481bdaf8e89e5148ac5d89cffd3b05cd533eeb423b59",
"sha256:608862a7bf6957f2333fc54ab4399e405baad0163dc9f8d99cb236816db169d4",
"sha256:62595ab75873d50d57323a91dd03e6966eb79c41fa834b7a1661ed043b2d404d",
"sha256:70990b9c51340e4044cfc394a81f614f3f90d41397104d226f21e66de668730d",
"sha256:71140351489970dfe5e60fc621ada3e0f41104a5eddaca47a7acb3c1b851d6d3",
"sha256:72966d1b297c741541ca8cf1223ff262a6febe52481af742036a0b296e35fa5a",
"sha256:74292fc76c905c0ef095fe11e188a32ebd03bc38f3f3e9bcb85e4e6db177b7ea",
"sha256:761e8904c07ad053d285670f36dd94e1b6ab7f16ce62b9805c475b7aa1cffde6",
"sha256:772b87914ff1152b92a197ef4ea40efe27a378606c39446ded52c8f80f79702e",
"sha256:79909e27e8e4fcc9db4addea88aa63f6423ebb171db091fb4373e3312cb6d603",
"sha256:7e189e2e1d3ed2f4aebabd2d5b0f931e883676e51c7624826e0a4e5fe8a0bf24",
"sha256:7eb33a30d75562222b64f569c642ff3dc6689e09adda43a082208397f016c39a",
"sha256:81d6741ab457d14fdedc215516665050f3822d3e56508921cc7239f8c8e66a58",
"sha256:8499ca8f4502af841f68135133d8258f7b32a53a1d594aa98cc52013fff55678",
"sha256:84c3990934bae40ea69a82034912ffe5a62c60bbf6ec5bc9691419641d7d5c9a",
"sha256:87701167f2a5c930b403e9756fab1d31d4d4da52856143b609e30a1ce7160f3c",
"sha256:88600c72ef7587fe1708fd242b385b6ed4b8904976d5da0893e31df8b3480cb6",
"sha256:8ac7b6a045b814cf0c47f3623d21ebd88b3e8cf216a14790b455ea7ff0135d18",
"sha256:8b8af03d2e37866d023ad0ddea594edefc31e827fee64f8de5611a1dbc373174",
"sha256:8c7fe7afa480e3e82eed58e0ca89f751cd14d767638e2550c77a92a9e749c317",
"sha256:8eade758719add78ec36dc13201483f8e9b5d940329285edcd5f70c0a9edbd7f",
"sha256:911d8a40b2bef5b8bbae2e36a0b103f142ac53557ab421dc16ac4aafee6f53dc",
"sha256:93ad6d87ac18e2a90b0fe89df7c65263b9a99a0eb98f0a3d2e079f12a0735837",
"sha256:95dea361dd73757c6f1c0a1480ac499952c16ac83f7f5f4f84f0658a01b8ef41",
"sha256:9ab77acb98eba3fd2a85cd160851816bfce6871d944d885febf012713f06659c",
"sha256:9cb3032517f1627cc012dbc80a8ec976ae76d93ea2b5feaa9d2a5b8882597579",
"sha256:9cf4e8ad252f7c38dd1f676b46514f92dc0ebeb0db5552f5f403509705e24753",
"sha256:9d9153257a3f70d5f69edf2325357251ed20f772b12e593f3b3377b5f78e7ef8",
"sha256:a152f5f33d64a6be73f1d30c9cc82dfc73cec6477ec268e7c6e4c7d23c2d2291",
"sha256:a16418ecf1329f71df119e8a65f3aa68004a3f9383821edcb20f0702934d8087",
"sha256:a60332922359f920193b1d4826953c507a877b523b2395ad7bc716ddd386d866",
"sha256:a8d0fc946c784ff7f7c3742310cc8a57c5c6dc31631269876a88b809dbeff3d3",
"sha256:ab5de034a886f616a5668aa5d098af2b5385ed70142090e2a31bcbd0af0fdb3d",
"sha256:c22d3fe05ce11d3671297dc8973267daa0f938b93ec716e12e0f6dee81591dc1",
"sha256:c2ac1b08635a8cd4e0cbeaf6f5e922085908d48eb05d44c5ae9eabab148512ca",
"sha256:c512accbd6ff0270939b9ac214b84fb5ada5f0409c44298361b2f5e13f9aed9e",
"sha256:c75ffc45f25324e68ab238cb4b5c0a38cd1c3d7f1fb1f72b5541de469e2247db",
"sha256:c95a03c79bbe30eec3ec2b7f076074f4281526724c8685a42872974ef4d36b72",
"sha256:cadaeaba78750d58d3cc6ac4d1fd867da6fc73c88156b7a3212a3cd4819d679d",
"sha256:cd6056167405314a4dc3c173943f11249fa0f1b204f8b51ed4bde1a9cd1834dc",
"sha256:db72b07027db150f468fbada4d85b3b2729a3db39178abf5c543b784c1254539",
"sha256:df2c707231459e8a4028eabcd3cfc827befd635b3ef72eada84ab13b52e1574d",
"sha256:e62164b50f84e20601c1ff8eb55620d2ad25fb81b59e3cd776a1902527a788af",
"sha256:e696f0dd336161fca9adbb846875d40752e6eba585843c768935ba5c9960722b",
"sha256:eaa379fcd227ca235d04152ca6704c7cb55564116f8bc52545ff357628e10602",
"sha256:ebea339af930f8ca5d7a699b921106c6e29c617fe9606fa7baa043c1cdae326f",
"sha256:f4c39b0e3eac288fedc2b43055cfc2ca7a60362d0e5e87a637beac5d801ef478",
"sha256:f5057856d21e7586765171eac8b9fc3f7d44ef39425f85dbcccb13b3ebea806c",
"sha256:f6f45710b4459401609ebebdbcfb34515da4fc2aa886f95107f556ac69a9147e",
"sha256:f97e83fa6c25693c7a35de154681fcc257c1c41b38beb0304b9c4d2d9e164479",
"sha256:f9d0c5c045a3ca9bedfc35dca8526798eb91a07aa7a2c0fee134c6c6f321cbd7",
"sha256:ff6f3db31555657f3163b15a6b7c6938d08df7adbfc9dd13d9d19edad678f1e8"
"sha256:04afa6387e2b282cf78ff3dbce20f0cc071c12dc8f685bd40960cc68644cfea6",
"sha256:04eefcee095f58eaabe6dc3cc2262f3bcd776d2c67005880894f447b3f2cb9c1",
"sha256:0be65ccf618c1e7ac9b849c315cc2e8a8751d9cfdaa43027d4f6624bd587ab7e",
"sha256:0c95f12b74681e9ae127728f7e5409cbbef9cd914d5896ef238cc779b8152373",
"sha256:0ca564606d2caafb0abe6d1b5311c2649e8071eb241b2d64e75a0d0065107e62",
"sha256:10c93628d7497c81686e8e5e557aafa78f230cd9e77dd0c40032ef90c18f2230",
"sha256:11d117e6c63e8f495412d37e7dc2e2fff09c34b2d09dbe2bee3c6229577818be",
"sha256:11d3bcb7be35e7b1bba2c23beedac81ee893ac9871d0ba79effc7fc01167db6c",
"sha256:12a2b561af122e3d94cdb97fe6fb2bb2b82cef0cdca131646fdb940a1eda04f0",
"sha256:12d1a39aa6b8c6f6248bb54550efcc1c38ce0d8096a146638fd4738e42284448",
"sha256:1435ae15108b1cb6fffbcea2af3d468683b7afed0169ad718451f8db5d1aff6f",
"sha256:1c60b9c202d00052183c9be85e5eaf18a4ada0a47d188a83c8f5c5b23252f649",
"sha256:1e8fcdd8f672a1c4fc8d0bd3a2b576b152d2a349782d1eb0f6b8e52e9954731d",
"sha256:20064ead0717cf9a73a6d1e779b23d149b53daf971169289ed2ed43a71e8d3b0",
"sha256:21fa558996782fc226b529fdd2ed7866c2c6ec91cee82735c98a197fae39f706",
"sha256:22908891a380d50738e1f978667536f6c6b526a2064156203d418f4856d6e86a",
"sha256:3160a0fd9754aab7d47f95a6b63ab355388d890163eb03b2d2b87ab0a30cfa59",
"sha256:322102cdf1ab682ecc7d9b1c5eed4ec59657a65e1c146a0da342b78f4112db23",
"sha256:34e0a2f9c370eb95597aae63bf85eb5e96826d81e3dcf88b8886012906f509b5",
"sha256:3573d376454d956553c356df45bb824262c397c6e26ce43e8203c4c540ee0acb",
"sha256:3747443b6a904001473370d7810aa19c3a180ccd52a7157aacc264a5ac79265e",
"sha256:38e812a197bf8e71a59fe55b757a84c1f946d0ac114acafaafaf21667a7e169e",
"sha256:3a06f32c9634a8705f4ca9946d667609f52cf130d5548881401f1eb2c39b1e2c",
"sha256:3a5fc78f9e3f501a1614a98f7c54d3969f3ad9bba8ba3d9b438c3bc5d047dd28",
"sha256:3d9098b479e78c85080c98e1e35ff40b4a31d8953102bb0fd7d1b6f8a2111a3d",
"sha256:3dc5b6a8ecfdc5748a7e429782598e4f17ef378e3e272eeb1340ea57c9109f41",
"sha256:4155b51ae05ed47199dc5b2a4e62abccb274cee6b01da5b895099b61b1982974",
"sha256:49919f8400b5e49e961f320c735388ee686a62327e773fa5b3ce6721f7e785ce",
"sha256:53d0a3fa5f8af98a1e261de6a3943ca631c526635eb5817a87a59d9a57ebf48f",
"sha256:5f008525e02908b20e04707a4f704cd286d94718f48bb33edddc7d7b584dddc1",
"sha256:628c985afb2c7d27a4800bfb609e03985aaecb42f955049957814e0491d4006d",
"sha256:65ed923f84a6844de5fd29726b888e58c62820e0769b76565480e1fdc3d062f8",
"sha256:6734e606355834f13445b6adc38b53c0fd45f1a56a9ba06c2058f86893ae8017",
"sha256:6baf0baf0d5d265fa7944feb9f7451cc316bfe30e8df1a61b1bb08577c554f31",
"sha256:6f4f4668e1831850ebcc2fd0b1cd11721947b6dc7c00bf1c6bd3c929ae14f2c7",
"sha256:6f5c2e7bc8a4bf7c426599765b1bd33217ec84023033672c1e9a8b35eaeaaaf8",
"sha256:6f6c7a8a57e9405cad7485f4c9d3172ae486cfef1344b5ddd8e5239582d7355e",
"sha256:7381c66e0561c5757ffe616af869b916c8b4e42b367ab29fedc98481d1e74e14",
"sha256:73dc03a6a7e30b7edc5b01b601e53e7fc924b04e1835e8e407c12c037e81adbd",
"sha256:74db0052d985cf37fa111828d0dd230776ac99c740e1a758ad99094be4f1803d",
"sha256:75f2568b4189dda1c567339b48cba4ac7384accb9c2a7ed655cd86b04055c795",
"sha256:78cacd03e79d009d95635e7d6ff12c21eb89b894c354bd2b2ed0b4763373693b",
"sha256:80d1543d58bd3d6c271b66abf454d437a438dff01c3e62fdbcd68f2a11310d4b",
"sha256:830d2948a5ec37c386d3170c483063798d7879037492540f10a475e3fd6f244b",
"sha256:891cf9b48776b5c61c700b55a598621fdb7b1e301a550365571e9624f270c203",
"sha256:8f25e17ab3039b05f762b0a55ae0b3632b2e073d9c8fc88e89aca31a6198e88f",
"sha256:9a3267620866c9d17b959a84dd0bd2d45719b817245e49371ead79ed4f710d19",
"sha256:a04f86f41a8916fe45ac5024ec477f41f886b3c435da2d4e3d2709b22ab02af1",
"sha256:aaf53a6cebad0eae578f062c7d462155eada9c172bd8c4d250b8c1d8eb7f916a",
"sha256:abc1185d79f47c0a7aaf7e2412a0eb2c03b724581139193d2d82b3ad8cbb00ac",
"sha256:ac0aa6cd53ab9a31d397f8303f92c42f534693528fafbdb997c82bae6e477ad9",
"sha256:ac3775e3311661d4adace3697a52ac0bab17edd166087d493b52d4f4f553f9f0",
"sha256:b06f0d3bf045158d2fb8837c5785fe9ff9b8c93358be64461a1089f5da983137",
"sha256:b116502087ce8a6b7a5f1814568ccbd0e9f6cfd99948aa59b0e241dc57cf739f",
"sha256:b82fab78e0b1329e183a65260581de4375f619167478dddab510c6c6fb04d9b6",
"sha256:bd7163182133c0c7701b25e604cf1611c0d87712e56e88e7ee5d72deab3e76b5",
"sha256:c36bcbc0d5174a80d6cccf43a0ecaca44e81d25be4b7f90f0ed7bcfbb5a00909",
"sha256:c3af8e0f07399d3176b179f2e2634c3ce9c1301379a6b8c9c9aeecd481da494f",
"sha256:c84132a54c750fda57729d1e2599bb598f5fa0344085dbde5003ba429a4798c0",
"sha256:cb7b2ab0188829593b9de646545175547a70d9a6e2b63bf2cd87a0a391599324",
"sha256:cca4def576f47a09a943666b8f829606bcb17e2bc2d5911a46c8f8da45f56755",
"sha256:cf6511efa4801b9b38dc5546d7547d5b5c6ef4b081c60b23e4d941d0eba9cbeb",
"sha256:d16fd5252f883eb074ca55cb622bc0bee49b979ae4e8639fff6ca3ff44f9f854",
"sha256:d2686f91611f9e17f4548dbf050e75b079bbc2a82be565832bc8ea9047b61c8c",
"sha256:d7fc3fca01da18fbabe4625d64bb612b533533ed10045a2ac3dd194bfa656b60",
"sha256:dd5653e67b149503c68c4018bf07e42eeed6b4e956b24c00ccdf93ac79cdff84",
"sha256:de5695a6f1d8340b12a5d6d4484290ee74d61e467c39ff03b39e30df62cf83a0",
"sha256:e0ac8959c929593fee38da1c2b64ee9778733cdf03c482c9ff1d508b6b593b2b",
"sha256:e1b25e3ad6c909f398df8921780d6a3d120d8c09466720226fc621605b6f92b1",
"sha256:e633940f28c1e913615fd624fcdd72fdba807bf53ea6925d6a588e84e1151531",
"sha256:e89df2958e5159b811af9ff0f92614dabf4ff617c03a4c1c6ff53bf1c399e0e1",
"sha256:ea9f9c6034ea2d93d9147818f17c2a0860d41b71c38b9ce4d55f21b6f9165a11",
"sha256:f645caaf0008bacf349875a974220f1f1da349c5dbe7c4ec93048cdc785a3326",
"sha256:f8303414c7b03f794347ad062c0516cee0e15f7a612abd0ce1e25caf6ceb47df",
"sha256:fca62a8301b605b954ad2e9c3666f9d97f63872aa4efcae5492baca2056b74ab"
],
"version": "==3.0.1"
"markers": "python_version >= '3.7'",
"version": "==3.1.0"
},
"click": {
"hashes": [
@@ -261,11 +249,11 @@
},
"exceptiongroup": {
"hashes": [
"sha256:327cbda3da756e2de031a3107b81ab7b3770a602c4d16ca618298c526f4bec1e",
"sha256:bcb67d800a4497e1b404c2dd44fca47d3b7a5e5433dbab67f96c1a685cdfdf23"
"sha256:232c37c63e4f682982c8b6459f33a8981039e5fb8756b2074364e5055c498c9e",
"sha256:d484c3090ba2889ae2928419117447a14daf3c1231d5e30d0aae34f354f01785"
],
"markers": "python_version < '3.11'",
"version": "==1.1.0"
"version": "==1.1.1"
},
"idna": {
"hashes": [
@@ -381,19 +369,19 @@
},
"pathspec": {
"hashes": [
"sha256:3a66eb970cbac598f9e5ccb5b2cf58930cd8e3ed86d393d541eaf2d8b1705229",
"sha256:64d338d4e0914e91c1792321e6907b5a593f1ab1851de7fc269557a21b30ebbc"
"sha256:2798de800fa92780e33acca925945e9a19a133b715067cf165b8866c15a31687",
"sha256:d8af70af76652554bd134c22b3e8a1cc46ed7d91edcdd721ef1a0c51a84a5293"
],
"markers": "python_version >= '3.7'",
"version": "==0.11.0"
"version": "==0.11.1"
},
"platformdirs": {
"hashes": [
"sha256:8a1228abb1ef82d788f74139988b137e78692984ec7b08eaa6c65f1723af28f9",
"sha256:b1d5eb14f221506f50d6604a561f4c5786d9e80355219694a1b244bcd96f4567"
"sha256:024996549ee88ec1a9aa99ff7f8fc819bb59e2c3477b410d90a16d32d6e707aa",
"sha256:e5986afb596e4bb5bde29a79ac9061aa955b94fca2399b7aaac4090860920dd8"
],
"markers": "python_version >= '3.7'",
"version": "==3.0.0"
"version": "==3.1.1"
},
"pluggy": {
"hashes": [
@@ -403,14 +391,6 @@
"markers": "python_version >= '3.6'",
"version": "==1.0.0"
},
"py": {
"hashes": [
"sha256:51c75c4126074b472f746a24399ad32f6053d1b34b68d2fa41e558e6f4a98719",
"sha256:607c53218732647dff4acdfcd50cb62615cedf612e72d1724fb1a0cc6405b378"
],
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
"version": "==1.11.0"
},
"pygments": {
"hashes": [
"sha256:b3ed06a9e8ac9a9aae5a6f5dbe78a8a58655d17b43b93c078f094ddc476ae297",
@@ -421,11 +401,11 @@
},
"pytest": {
"hashes": [
"sha256:c7c6ca206e93355074ae32f7403e8ea12163b1163c976fee7d4d84027c162be5",
"sha256:d45e0952f3727241918b8fd0f376f5ff6b301cc0777c6f9a556935c92d8a7d42"
"sha256:130328f552dcfac0b1cec75c12e3f005619dc5f874f0a06e8ff7263f0ee6225e",
"sha256:c99ab0c73aceb050f68929bc93af19ab6db0558791c6a0715723abe9d0ade9d4"
],
"index": "pypi",
"version": "==7.2.1"
"version": "==7.2.2"
},
"pytest-cov": {
"hashes": [
@@ -437,11 +417,11 @@
},
"pytest-html": {
"hashes": [
"sha256:868c08564a68d8b2c26866f1e33178419bb35b1e127c33784a28622eb827f3f3",
"sha256:c4e2f4bb0bffc437f51ad2174a8a3e71df81bbc2f6894604e604af18fbe687c3"
"sha256:1b8789f0f338f4d7c5cd49d94e9eb15df4c5fecc7e2a7bd07e7040fc4cf675d3",
"sha256:f803a1f93106752575f51a89250af7e01d3470de13657207c6bb5409b5258a29"
],
"index": "pypi",
"version": "==3.2.0"
"version": "==4.0.0rc0"
},
"pytest-metadata": {
"hashes": [
@@ -459,14 +439,6 @@
"markers": "python_version >= '3.7' and python_version < '4'",
"version": "==2.28.2"
},
"setuptools": {
"hashes": [
"sha256:e5fd0a713141a4a105412233c63dc4e17ba0090c8e8334594ac790ec97792330",
"sha256:f106dee1b506dee5102cc3f3e9e68137bbad6d47b616be7991714b0c62204251"
],
"markers": "python_version >= '3.7'",
"version": "==67.4.0"
},
"snowballstemmer": {
"hashes": [
"sha256:09b16deb8547d3412ad7b590689584cd0fe25ec8db3be37788be3810cbf19cb1",
@@ -516,11 +488,11 @@
},
"sphinxcontrib-jquery": {
"hashes": [
"sha256:8fb65f6dba84bf7bcd1aea1f02ab3955ac34611d838bcc95d4983b805b234daa",
"sha256:ed47fa425c338ffebe3c37e1cdb56e30eb806116b85f01055b158c7057fdb995"
"sha256:1620739f04e36a2c779f1a131a2dfd49b2fd07351bf1968ced074365933abc7a",
"sha256:f936030d7d0147dd026a4f2b5a57343d233f1fc7b363f68b3d4f1cb0993878ae"
],
"markers": "python_version >= '3.1'",
"version": "==2.0.0"
"version": "==4.1"
},
"sphinxcontrib-jsmath": {
"hashes": [
@@ -564,11 +536,11 @@
},
"urllib3": {
"hashes": [
"sha256:076907bf8fd355cde77728471316625a4d2f7e713c125f51953bb5b3eecf4f72",
"sha256:75edcdc2f7d85b137124a6c3c9fc3933cdeaa12ecb9a6a959f22797a0feca7e1"
"sha256:8a388717b9476f934a21484e8c8e61875ab60644d29b9b39e11e4b9dc1c6b305",
"sha256:aa751d169e23c7479ce47a0cb0da579e3ede798f994f5816a74e4f4500dcea42"
],
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4, 3.5'",
"version": "==1.26.14"
"version": "==1.26.15"
},
"zipp": {
"hashes": [

View File

@@ -75,6 +75,8 @@ class ETLController:
for analysis by using Transformer objects that have been registered with the controller.
"""
posts_to_insert = []
def __init__(self):
self.transformers = []
@@ -107,6 +109,29 @@ class ETLController:
self.session = sessionmaker()
self.session.configure(bind=engine)
# MAY4 can try adding some new functions for batching post inserts
def flush_posts(self, session):
session.bulk_save_objects(self.posts_to_insert)
logger.info(f"Bulk saved {len(self.posts_to_insert)} posts")
self.posts_to_insert = []
def insert_post(self, obj, session, hydrate: bool = True, flush: bool = False):
if hydrate and type(obj) != Video:
obj.hydrate()
if flush:
self.flush_posts()
session.add(obj)
session.flush()
logger.trace(f"Inserted new object {obj}")
return obj
else:
self.posts_to_insert.append(obj)
return None
def insert_or_select(self, obj, session, hydrate: bool = True):
"""Inserts an object into the database or returns an existing object from the database.
Regardless, the resulting object has an `id` attribute that can be referenced later."""
@@ -122,7 +147,7 @@ class ETLController:
(Channel.platform==obj.platform)).first()
elif type(obj) == Post:
instance = None
return self.insert_post(obj, session, hydrate)
# instance = session.query(Post).filter_by(platform=obj.platform, platform_id=obj.platform_id).first()
elif issubclass(type(obj), Media):
@@ -216,7 +241,7 @@ class ETLController:
if handled == False:
logger.warning(f"No Transformer could handle ID {result.id} with platform {result.platform} ({result.date})")
@logger.catch(reraise=True)
def transform_all_untransformed(self, hydrate: bool = True, min_id=0):
"""Transform all ScraperResult objects in the database that do not have an

View File

@@ -107,8 +107,8 @@ class BitchuteTransformer(Transformer):
video_title = raw['subject'],
video_duration = _parse_duration_str(raw['length']))
# insert_post
transformed = insert(transformed)
session.flush()
def parse_created(created: str, date_archived: datetime) -> datetime:
"""Convert a created string (e.g. ``"1 year, 10 months ago"``) to a datetime

View File

@@ -79,12 +79,9 @@ class RumbleTransformer(Transformer):
video_title = raw['title'],
video_duration=_parse_duration_str(raw['duration']))
# insert_post
insert(transformed)
# media = self.process_media(raw, transformed.id, data)
# for m in media:
# insert(m)
def _process_number(s):
if s is None:

View File

@@ -9,7 +9,7 @@ import time
from telethon.sync import TelegramClient
from telethon.errors.rpcerrorlist import ChannelPrivateError, ChannelInvalidError
from telethon.tl import types
from telethon.helpers import add_surrogate
from telethon.helpers import add_surrogate, del_surrogate
import os
from datetime import datetime, timezone
@@ -270,12 +270,23 @@ class TelegramTelethonTransformer(Transformer):
views = raw.get('views')
)
transformed = insert(transformed)
# insert_post
insert(transformed)
def stripped(s):
"""https://stackoverflow.com/a/29933716"""
lstripped = ''.join(takewhile(str.isspace, s))
rstripped = ''.join(reversed(tuple(takewhile(str.isspace, reversed(s)))))
return lstripped + rstripped
def add_markdown_links(raw_post):
"""This function is necessary because Telethon's markdown.unparse doesn't
correctly handle trailing whitespace or multi-line links"""
global_offset = 0
transformed_content = raw_post['message']
transformed_content = add_surrogate(raw_post['message'])
links = [entity for entity in raw_post['entities'] if entity['_'] == 'MessageEntityTextUrl']
for link in links:
@@ -284,12 +295,18 @@ def add_markdown_links(raw_post):
url = link['url']
before_link = transformed_content[:offset]
link_text = f"[{transformed_content[offset:offset+length].strip()}]"
trailing_whitespace = ''.join([c for c in transformed_content[offset:offset+length] if c.isspace()])
link_href = f"({url})"
after_link = transformed_content[offset+length:]
transformed_content = before_link + link_text + link_href + trailing_whitespace + after_link
global_offset += (4 + len(url))
inner_text = transformed_content[offset:offset+length]
return transformed_content
# skip creation of link if inner link text is only whitespace
if inner_text.replace('\u200b', '').strip():
processed_inner_text = inner_text.strip().replace('\n', '\\\n')
link_text = f"[{processed_inner_text}]"
trailing_whitespace = stripped(transformed_content[offset:offset+length])
link_href = f"({url})"
after_link = transformed_content[offset+length:]
transformed_content = before_link + link_text + link_href + trailing_whitespace + after_link
global_offset += (4 + len(url) + inner_text.strip().count('\n'))
return del_surrogate(transformed_content)

View File

@@ -133,8 +133,5 @@ class TwitterTransformer(Transformer):
if raw['quotedTweet'] is not None:
subtweet(raw['quotedTweet'])
insert(transformed)
media = self.process_media(raw, transformed.id, data)
for m in media:
insert(m)
#insert_post
insert(transformed)

View File

@@ -66,6 +66,7 @@ class VkontakteTransformer(Transformer):
outlinks =list(filter(None, raw["outlinks"])) if raw['outlinks'] else [],
)
# insert_post
insert(transformed)
# media = self.process_media(raw, transformed.id, data)

6
crontab Normal file
View File

@@ -0,0 +1,6 @@
0,10,20,30,40,50 * * * * cd /home/appuser; /usr/bin/flock -w 0 scraper.lock pipenv run python app.py scrape-channels > /proc/1/fd/1 2>/proc/1/fd/2
0 * * * * cd /home/appuser; /usr/bin/timeout -s 2 10800 /usr/bin/flock -w 0 transform.lock pipenv run python app.py transform > /proc/1/fd/1 2>/proc/1/fd/2
0 21 * * * cd /home/appuser; /usr/bin/timeout -s 2 10800 /usr/bin/flock -w 0 scraper.lock pipenv run python app.py channel-info > /proc/1/fd/1 2>/proc/1/fd/2
30 * * * * cd /home/appuser; /usr/bin/flock -w 0 transform.lock pipenv run python app.py transform-info > /proc/1/fd/1 2>/proc/1/fd/2
0 * * * * cd /home/appuser; /usr/bin/flock -w 0 sync.lock pipenv run python app.py sync-channels --gsheet https://docs.google.com/spreadsheets/d/1UnBxtRVkabKHkDUkLxtFOSIX9akytd-rZyV-H_K1PWg/edit > /proc/1/fd/1 2>/proc/1/fd/2
30 * * * * cd /home/appuser; /usr/bin/timeout -s 2 82800 pipenv run python app.py transform-media > /proc/1/fd/1 2>/proc/1/fd/2