Add new function for insert post (faster/bulk)

This commit is contained in:
Logan Williams
2023-05-04 14:04:55 +02:00
parent 2320ea1efd
commit ebbc6b69dd
11 changed files with 209 additions and 150 deletions

1
.gitignore vendored
View File

@@ -8,6 +8,7 @@ docs/source/_*
*.ipynb *.ipynb
*.db *.db
.env .env
.env*
*.session *.session
*.session-journal *.session-journal
service_account.json service_account.json

42
Dockerfile Normal file
View File

@@ -0,0 +1,42 @@
# Multi-stage image for the scraper/transformer cron worker:
#   base        - Python runtime plus interpreter-wide environment settings
#   python-deps - builds the virtualenv at /.venv (compilers stay in this stage)
#   runtime     - final image: virtualenv + application + cron in the foreground
FROM python:3.9-slim AS base

# Setup env (key=value form; the space-separated ENV syntax is deprecated)
ENV LANG=C.UTF-8 \
    LC_ALL=C.UTF-8 \
    PYTHONDONTWRITEBYTECODE=1 \
    PYTHONFAULTHANDLER=1


FROM base AS python-deps

# Install pipenv and compilation dependencies.
# Package lists and pip's download cache are dropped in the same layer so they
# do not end up in the image.
RUN pip install --no-cache-dir pipenv
RUN apt-get update \
    && apt-get install -y --no-install-recommends \
        g++ \
        gcc \
        git \
        libpq-dev \
        musl-dev \
    && rm -rf /var/lib/apt/lists/*

# Install python dependencies in /.venv
# (no WORKDIR is set in this stage, so the project directory is / and
# PIPENV_VENV_IN_PROJECT=1 places the virtualenv at /.venv)
COPY Pipfile Pipfile.lock ./
RUN PIPENV_VENV_IN_PROJECT=1 pipenv install --deploy


FROM base AS runtime

RUN apt-get update \
    && apt-get install -y --no-install-recommends \
        cron \
        libpq-dev \
    && rm -rf /var/lib/apt/lists/*

# Copy virtual env from python-deps stage
COPY --from=python-deps /.venv /.venv
ENV PATH="/.venv/bin:$PATH"

# Create and switch to a new user
# RUN useradd --create-home appuser
WORKDIR /root
# USER appuser

# Install application into container
COPY . .
RUN ./spacy_setup.sh

# Install the schedule as root's user crontab and start cron in foreground mode.
# /etc/crontabs/ is the BusyBox/Alpine crond location; Debian's cron (used by
# this base image) does not read it, so the file is registered with `crontab`.
COPY crontab /tmp/crontab
RUN crontab /tmp/crontab
CMD ["cron", "-f"]

View File

@@ -28,6 +28,7 @@ ocrd-pyexiftool = "*"
filelock = "*" filelock = "*"
telethon = "*" telethon = "*"
psycopg2 = "*" psycopg2 = "*"
joblib = "*"
[dev-packages] [dev-packages]
pytest = "*" pytest = "*"

226
Pipfile.lock generated
View File

@@ -1,7 +1,7 @@
{ {
"_meta": { "_meta": {
"hash": { "hash": {
"sha256": "2eb38f729271771b6fd5c72729a49bd99ee5f90edaf16aceb5be0131990a1805" "sha256": "4a8b1578e90c9c6f6cac1ec74dc04df51396fb63c7e3a2f693d5f288c2886cb9"
}, },
"pipfile-spec": 6, "pipfile-spec": 6,
"requires": { "requires": {
@@ -92,96 +92,84 @@
}, },
"charset-normalizer": { "charset-normalizer": {
"hashes": [ "hashes": [
"sha256:00d3ffdaafe92a5dc603cb9bd5111aaa36dfa187c8285c543be562e61b755f6b", "sha256:04afa6387e2b282cf78ff3dbce20f0cc071c12dc8f685bd40960cc68644cfea6",
"sha256:024e606be3ed92216e2b6952ed859d86b4cfa52cd5bc5f050e7dc28f9b43ec42", "sha256:04eefcee095f58eaabe6dc3cc2262f3bcd776d2c67005880894f447b3f2cb9c1",
"sha256:0298eafff88c99982a4cf66ba2efa1128e4ddaca0b05eec4c456bbc7db691d8d", "sha256:0be65ccf618c1e7ac9b849c315cc2e8a8751d9cfdaa43027d4f6624bd587ab7e",
"sha256:02a51034802cbf38db3f89c66fb5d2ec57e6fe7ef2f4a44d070a593c3688667b", "sha256:0c95f12b74681e9ae127728f7e5409cbbef9cd914d5896ef238cc779b8152373",
"sha256:083c8d17153ecb403e5e1eb76a7ef4babfc2c48d58899c98fcaa04833e7a2f9a", "sha256:0ca564606d2caafb0abe6d1b5311c2649e8071eb241b2d64e75a0d0065107e62",
"sha256:0a11e971ed097d24c534c037d298ad32c6ce81a45736d31e0ff0ad37ab437d59", "sha256:10c93628d7497c81686e8e5e557aafa78f230cd9e77dd0c40032ef90c18f2230",
"sha256:0bf2dae5291758b6f84cf923bfaa285632816007db0330002fa1de38bfcb7154", "sha256:11d117e6c63e8f495412d37e7dc2e2fff09c34b2d09dbe2bee3c6229577818be",
"sha256:0c0a590235ccd933d9892c627dec5bc7511ce6ad6c1011fdf5b11363022746c1", "sha256:11d3bcb7be35e7b1bba2c23beedac81ee893ac9871d0ba79effc7fc01167db6c",
"sha256:0f438ae3532723fb6ead77e7c604be7c8374094ef4ee2c5e03a3a17f1fca256c", "sha256:12a2b561af122e3d94cdb97fe6fb2bb2b82cef0cdca131646fdb940a1eda04f0",
"sha256:109487860ef6a328f3eec66f2bf78b0b72400280d8f8ea05f69c51644ba6521a", "sha256:12d1a39aa6b8c6f6248bb54550efcc1c38ce0d8096a146638fd4738e42284448",
"sha256:11b53acf2411c3b09e6af37e4b9005cba376c872503c8f28218c7243582df45d", "sha256:1435ae15108b1cb6fffbcea2af3d468683b7afed0169ad718451f8db5d1aff6f",
"sha256:12db3b2c533c23ab812c2b25934f60383361f8a376ae272665f8e48b88e8e1c6", "sha256:1c60b9c202d00052183c9be85e5eaf18a4ada0a47d188a83c8f5c5b23252f649",
"sha256:14e76c0f23218b8f46c4d87018ca2e441535aed3632ca134b10239dfb6dadd6b", "sha256:1e8fcdd8f672a1c4fc8d0bd3a2b576b152d2a349782d1eb0f6b8e52e9954731d",
"sha256:16a8663d6e281208d78806dbe14ee9903715361cf81f6d4309944e4d1e59ac5b", "sha256:20064ead0717cf9a73a6d1e779b23d149b53daf971169289ed2ed43a71e8d3b0",
"sha256:292d5e8ba896bbfd6334b096e34bffb56161c81408d6d036a7dfa6929cff8783", "sha256:21fa558996782fc226b529fdd2ed7866c2c6ec91cee82735c98a197fae39f706",
"sha256:2c03cc56021a4bd59be889c2b9257dae13bf55041a3372d3295416f86b295fb5", "sha256:22908891a380d50738e1f978667536f6c6b526a2064156203d418f4856d6e86a",
"sha256:2e396d70bc4ef5325b72b593a72c8979999aa52fb8bcf03f701c1b03e1166918", "sha256:3160a0fd9754aab7d47f95a6b63ab355388d890163eb03b2d2b87ab0a30cfa59",
"sha256:2edb64ee7bf1ed524a1da60cdcd2e1f6e2b4f66ef7c077680739f1641f62f555", "sha256:322102cdf1ab682ecc7d9b1c5eed4ec59657a65e1c146a0da342b78f4112db23",
"sha256:31a9ddf4718d10ae04d9b18801bd776693487cbb57d74cc3458a7673f6f34639", "sha256:34e0a2f9c370eb95597aae63bf85eb5e96826d81e3dcf88b8886012906f509b5",
"sha256:356541bf4381fa35856dafa6a965916e54bed415ad8a24ee6de6e37deccf2786", "sha256:3573d376454d956553c356df45bb824262c397c6e26ce43e8203c4c540ee0acb",
"sha256:358a7c4cb8ba9b46c453b1dd8d9e431452d5249072e4f56cfda3149f6ab1405e", "sha256:3747443b6a904001473370d7810aa19c3a180ccd52a7157aacc264a5ac79265e",
"sha256:37f8febc8ec50c14f3ec9637505f28e58d4f66752207ea177c1d67df25da5aed", "sha256:38e812a197bf8e71a59fe55b757a84c1f946d0ac114acafaafaf21667a7e169e",
"sha256:39049da0ffb96c8cbb65cbf5c5f3ca3168990adf3551bd1dee10c48fce8ae820", "sha256:3a06f32c9634a8705f4ca9946d667609f52cf130d5548881401f1eb2c39b1e2c",
"sha256:39cf9ed17fe3b1bc81f33c9ceb6ce67683ee7526e65fde1447c772afc54a1bb8", "sha256:3a5fc78f9e3f501a1614a98f7c54d3969f3ad9bba8ba3d9b438c3bc5d047dd28",
"sha256:3ae1de54a77dc0d6d5fcf623290af4266412a7c4be0b1ff7444394f03f5c54e3", "sha256:3d9098b479e78c85080c98e1e35ff40b4a31d8953102bb0fd7d1b6f8a2111a3d",
"sha256:3b590df687e3c5ee0deef9fc8c547d81986d9a1b56073d82de008744452d6541", "sha256:3dc5b6a8ecfdc5748a7e429782598e4f17ef378e3e272eeb1340ea57c9109f41",
"sha256:3e45867f1f2ab0711d60c6c71746ac53537f1684baa699f4f668d4c6f6ce8e14", "sha256:4155b51ae05ed47199dc5b2a4e62abccb274cee6b01da5b895099b61b1982974",
"sha256:3fc1c4a2ffd64890aebdb3f97e1278b0cc72579a08ca4de8cd2c04799a3a22be", "sha256:49919f8400b5e49e961f320c735388ee686a62327e773fa5b3ce6721f7e785ce",
"sha256:4457ea6774b5611f4bed5eaa5df55f70abde42364d498c5134b7ef4c6958e20e", "sha256:53d0a3fa5f8af98a1e261de6a3943ca631c526635eb5817a87a59d9a57ebf48f",
"sha256:44ba614de5361b3e5278e1241fda3dc1838deed864b50a10d7ce92983797fa76", "sha256:5f008525e02908b20e04707a4f704cd286d94718f48bb33edddc7d7b584dddc1",
"sha256:4a8fcf28c05c1f6d7e177a9a46a1c52798bfe2ad80681d275b10dcf317deaf0b", "sha256:628c985afb2c7d27a4800bfb609e03985aaecb42f955049957814e0491d4006d",
"sha256:4b0d02d7102dd0f997580b51edc4cebcf2ab6397a7edf89f1c73b586c614272c", "sha256:65ed923f84a6844de5fd29726b888e58c62820e0769b76565480e1fdc3d062f8",
"sha256:502218f52498a36d6bf5ea77081844017bf7982cdbe521ad85e64cabee1b608b", "sha256:6734e606355834f13445b6adc38b53c0fd45f1a56a9ba06c2058f86893ae8017",
"sha256:503e65837c71b875ecdd733877d852adbc465bd82c768a067badd953bf1bc5a3", "sha256:6baf0baf0d5d265fa7944feb9f7451cc316bfe30e8df1a61b1bb08577c554f31",
"sha256:5995f0164fa7df59db4746112fec3f49c461dd6b31b841873443bdb077c13cfc", "sha256:6f4f4668e1831850ebcc2fd0b1cd11721947b6dc7c00bf1c6bd3c929ae14f2c7",
"sha256:59e5686dd847347e55dffcc191a96622f016bc0ad89105e24c14e0d6305acbc6", "sha256:6f5c2e7bc8a4bf7c426599765b1bd33217ec84023033672c1e9a8b35eaeaaaf8",
"sha256:601f36512f9e28f029d9481bdaf8e89e5148ac5d89cffd3b05cd533eeb423b59", "sha256:6f6c7a8a57e9405cad7485f4c9d3172ae486cfef1344b5ddd8e5239582d7355e",
"sha256:608862a7bf6957f2333fc54ab4399e405baad0163dc9f8d99cb236816db169d4", "sha256:7381c66e0561c5757ffe616af869b916c8b4e42b367ab29fedc98481d1e74e14",
"sha256:62595ab75873d50d57323a91dd03e6966eb79c41fa834b7a1661ed043b2d404d", "sha256:73dc03a6a7e30b7edc5b01b601e53e7fc924b04e1835e8e407c12c037e81adbd",
"sha256:70990b9c51340e4044cfc394a81f614f3f90d41397104d226f21e66de668730d", "sha256:74db0052d985cf37fa111828d0dd230776ac99c740e1a758ad99094be4f1803d",
"sha256:71140351489970dfe5e60fc621ada3e0f41104a5eddaca47a7acb3c1b851d6d3", "sha256:75f2568b4189dda1c567339b48cba4ac7384accb9c2a7ed655cd86b04055c795",
"sha256:72966d1b297c741541ca8cf1223ff262a6febe52481af742036a0b296e35fa5a", "sha256:78cacd03e79d009d95635e7d6ff12c21eb89b894c354bd2b2ed0b4763373693b",
"sha256:74292fc76c905c0ef095fe11e188a32ebd03bc38f3f3e9bcb85e4e6db177b7ea", "sha256:80d1543d58bd3d6c271b66abf454d437a438dff01c3e62fdbcd68f2a11310d4b",
"sha256:761e8904c07ad053d285670f36dd94e1b6ab7f16ce62b9805c475b7aa1cffde6", "sha256:830d2948a5ec37c386d3170c483063798d7879037492540f10a475e3fd6f244b",
"sha256:772b87914ff1152b92a197ef4ea40efe27a378606c39446ded52c8f80f79702e", "sha256:891cf9b48776b5c61c700b55a598621fdb7b1e301a550365571e9624f270c203",
"sha256:79909e27e8e4fcc9db4addea88aa63f6423ebb171db091fb4373e3312cb6d603", "sha256:8f25e17ab3039b05f762b0a55ae0b3632b2e073d9c8fc88e89aca31a6198e88f",
"sha256:7e189e2e1d3ed2f4aebabd2d5b0f931e883676e51c7624826e0a4e5fe8a0bf24", "sha256:9a3267620866c9d17b959a84dd0bd2d45719b817245e49371ead79ed4f710d19",
"sha256:7eb33a30d75562222b64f569c642ff3dc6689e09adda43a082208397f016c39a", "sha256:a04f86f41a8916fe45ac5024ec477f41f886b3c435da2d4e3d2709b22ab02af1",
"sha256:81d6741ab457d14fdedc215516665050f3822d3e56508921cc7239f8c8e66a58", "sha256:aaf53a6cebad0eae578f062c7d462155eada9c172bd8c4d250b8c1d8eb7f916a",
"sha256:8499ca8f4502af841f68135133d8258f7b32a53a1d594aa98cc52013fff55678", "sha256:abc1185d79f47c0a7aaf7e2412a0eb2c03b724581139193d2d82b3ad8cbb00ac",
"sha256:84c3990934bae40ea69a82034912ffe5a62c60bbf6ec5bc9691419641d7d5c9a", "sha256:ac0aa6cd53ab9a31d397f8303f92c42f534693528fafbdb997c82bae6e477ad9",
"sha256:87701167f2a5c930b403e9756fab1d31d4d4da52856143b609e30a1ce7160f3c", "sha256:ac3775e3311661d4adace3697a52ac0bab17edd166087d493b52d4f4f553f9f0",
"sha256:88600c72ef7587fe1708fd242b385b6ed4b8904976d5da0893e31df8b3480cb6", "sha256:b06f0d3bf045158d2fb8837c5785fe9ff9b8c93358be64461a1089f5da983137",
"sha256:8ac7b6a045b814cf0c47f3623d21ebd88b3e8cf216a14790b455ea7ff0135d18", "sha256:b116502087ce8a6b7a5f1814568ccbd0e9f6cfd99948aa59b0e241dc57cf739f",
"sha256:8b8af03d2e37866d023ad0ddea594edefc31e827fee64f8de5611a1dbc373174", "sha256:b82fab78e0b1329e183a65260581de4375f619167478dddab510c6c6fb04d9b6",
"sha256:8c7fe7afa480e3e82eed58e0ca89f751cd14d767638e2550c77a92a9e749c317", "sha256:bd7163182133c0c7701b25e604cf1611c0d87712e56e88e7ee5d72deab3e76b5",
"sha256:8eade758719add78ec36dc13201483f8e9b5d940329285edcd5f70c0a9edbd7f", "sha256:c36bcbc0d5174a80d6cccf43a0ecaca44e81d25be4b7f90f0ed7bcfbb5a00909",
"sha256:911d8a40b2bef5b8bbae2e36a0b103f142ac53557ab421dc16ac4aafee6f53dc", "sha256:c3af8e0f07399d3176b179f2e2634c3ce9c1301379a6b8c9c9aeecd481da494f",
"sha256:93ad6d87ac18e2a90b0fe89df7c65263b9a99a0eb98f0a3d2e079f12a0735837", "sha256:c84132a54c750fda57729d1e2599bb598f5fa0344085dbde5003ba429a4798c0",
"sha256:95dea361dd73757c6f1c0a1480ac499952c16ac83f7f5f4f84f0658a01b8ef41", "sha256:cb7b2ab0188829593b9de646545175547a70d9a6e2b63bf2cd87a0a391599324",
"sha256:9ab77acb98eba3fd2a85cd160851816bfce6871d944d885febf012713f06659c", "sha256:cca4def576f47a09a943666b8f829606bcb17e2bc2d5911a46c8f8da45f56755",
"sha256:9cb3032517f1627cc012dbc80a8ec976ae76d93ea2b5feaa9d2a5b8882597579", "sha256:cf6511efa4801b9b38dc5546d7547d5b5c6ef4b081c60b23e4d941d0eba9cbeb",
"sha256:9cf4e8ad252f7c38dd1f676b46514f92dc0ebeb0db5552f5f403509705e24753", "sha256:d16fd5252f883eb074ca55cb622bc0bee49b979ae4e8639fff6ca3ff44f9f854",
"sha256:9d9153257a3f70d5f69edf2325357251ed20f772b12e593f3b3377b5f78e7ef8", "sha256:d2686f91611f9e17f4548dbf050e75b079bbc2a82be565832bc8ea9047b61c8c",
"sha256:a152f5f33d64a6be73f1d30c9cc82dfc73cec6477ec268e7c6e4c7d23c2d2291", "sha256:d7fc3fca01da18fbabe4625d64bb612b533533ed10045a2ac3dd194bfa656b60",
"sha256:a16418ecf1329f71df119e8a65f3aa68004a3f9383821edcb20f0702934d8087", "sha256:dd5653e67b149503c68c4018bf07e42eeed6b4e956b24c00ccdf93ac79cdff84",
"sha256:a60332922359f920193b1d4826953c507a877b523b2395ad7bc716ddd386d866", "sha256:de5695a6f1d8340b12a5d6d4484290ee74d61e467c39ff03b39e30df62cf83a0",
"sha256:a8d0fc946c784ff7f7c3742310cc8a57c5c6dc31631269876a88b809dbeff3d3", "sha256:e0ac8959c929593fee38da1c2b64ee9778733cdf03c482c9ff1d508b6b593b2b",
"sha256:ab5de034a886f616a5668aa5d098af2b5385ed70142090e2a31bcbd0af0fdb3d", "sha256:e1b25e3ad6c909f398df8921780d6a3d120d8c09466720226fc621605b6f92b1",
"sha256:c22d3fe05ce11d3671297dc8973267daa0f938b93ec716e12e0f6dee81591dc1", "sha256:e633940f28c1e913615fd624fcdd72fdba807bf53ea6925d6a588e84e1151531",
"sha256:c2ac1b08635a8cd4e0cbeaf6f5e922085908d48eb05d44c5ae9eabab148512ca", "sha256:e89df2958e5159b811af9ff0f92614dabf4ff617c03a4c1c6ff53bf1c399e0e1",
"sha256:c512accbd6ff0270939b9ac214b84fb5ada5f0409c44298361b2f5e13f9aed9e", "sha256:ea9f9c6034ea2d93d9147818f17c2a0860d41b71c38b9ce4d55f21b6f9165a11",
"sha256:c75ffc45f25324e68ab238cb4b5c0a38cd1c3d7f1fb1f72b5541de469e2247db", "sha256:f645caaf0008bacf349875a974220f1f1da349c5dbe7c4ec93048cdc785a3326",
"sha256:c95a03c79bbe30eec3ec2b7f076074f4281526724c8685a42872974ef4d36b72", "sha256:f8303414c7b03f794347ad062c0516cee0e15f7a612abd0ce1e25caf6ceb47df",
"sha256:cadaeaba78750d58d3cc6ac4d1fd867da6fc73c88156b7a3212a3cd4819d679d", "sha256:fca62a8301b605b954ad2e9c3666f9d97f63872aa4efcae5492baca2056b74ab"
"sha256:cd6056167405314a4dc3c173943f11249fa0f1b204f8b51ed4bde1a9cd1834dc",
"sha256:db72b07027db150f468fbada4d85b3b2729a3db39178abf5c543b784c1254539",
"sha256:df2c707231459e8a4028eabcd3cfc827befd635b3ef72eada84ab13b52e1574d",
"sha256:e62164b50f84e20601c1ff8eb55620d2ad25fb81b59e3cd776a1902527a788af",
"sha256:e696f0dd336161fca9adbb846875d40752e6eba585843c768935ba5c9960722b",
"sha256:eaa379fcd227ca235d04152ca6704c7cb55564116f8bc52545ff357628e10602",
"sha256:ebea339af930f8ca5d7a699b921106c6e29c617fe9606fa7baa043c1cdae326f",
"sha256:f4c39b0e3eac288fedc2b43055cfc2ca7a60362d0e5e87a637beac5d801ef478",
"sha256:f5057856d21e7586765171eac8b9fc3f7d44ef39425f85dbcccb13b3ebea806c",
"sha256:f6f45710b4459401609ebebdbcfb34515da4fc2aa886f95107f556ac69a9147e",
"sha256:f97e83fa6c25693c7a35de154681fcc257c1c41b38beb0304b9c4d2d9e164479",
"sha256:f9d0c5c045a3ca9bedfc35dca8526798eb91a07aa7a2c0fee134c6c6f321cbd7",
"sha256:ff6f3db31555657f3163b15a6b7c6938d08df7adbfc9dd13d9d19edad678f1e8"
], ],
"version": "==3.0.1" "markers": "python_version >= '3.7'",
"version": "==3.1.0"
}, },
"click": { "click": {
"hashes": [ "hashes": [
@@ -261,11 +249,11 @@
}, },
"exceptiongroup": { "exceptiongroup": {
"hashes": [ "hashes": [
"sha256:327cbda3da756e2de031a3107b81ab7b3770a602c4d16ca618298c526f4bec1e", "sha256:232c37c63e4f682982c8b6459f33a8981039e5fb8756b2074364e5055c498c9e",
"sha256:bcb67d800a4497e1b404c2dd44fca47d3b7a5e5433dbab67f96c1a685cdfdf23" "sha256:d484c3090ba2889ae2928419117447a14daf3c1231d5e30d0aae34f354f01785"
], ],
"markers": "python_version < '3.11'", "markers": "python_version < '3.11'",
"version": "==1.1.0" "version": "==1.1.1"
}, },
"idna": { "idna": {
"hashes": [ "hashes": [
@@ -381,19 +369,19 @@
}, },
"pathspec": { "pathspec": {
"hashes": [ "hashes": [
"sha256:3a66eb970cbac598f9e5ccb5b2cf58930cd8e3ed86d393d541eaf2d8b1705229", "sha256:2798de800fa92780e33acca925945e9a19a133b715067cf165b8866c15a31687",
"sha256:64d338d4e0914e91c1792321e6907b5a593f1ab1851de7fc269557a21b30ebbc" "sha256:d8af70af76652554bd134c22b3e8a1cc46ed7d91edcdd721ef1a0c51a84a5293"
], ],
"markers": "python_version >= '3.7'", "markers": "python_version >= '3.7'",
"version": "==0.11.0" "version": "==0.11.1"
}, },
"platformdirs": { "platformdirs": {
"hashes": [ "hashes": [
"sha256:8a1228abb1ef82d788f74139988b137e78692984ec7b08eaa6c65f1723af28f9", "sha256:024996549ee88ec1a9aa99ff7f8fc819bb59e2c3477b410d90a16d32d6e707aa",
"sha256:b1d5eb14f221506f50d6604a561f4c5786d9e80355219694a1b244bcd96f4567" "sha256:e5986afb596e4bb5bde29a79ac9061aa955b94fca2399b7aaac4090860920dd8"
], ],
"markers": "python_version >= '3.7'", "markers": "python_version >= '3.7'",
"version": "==3.0.0" "version": "==3.1.1"
}, },
"pluggy": { "pluggy": {
"hashes": [ "hashes": [
@@ -403,14 +391,6 @@
"markers": "python_version >= '3.6'", "markers": "python_version >= '3.6'",
"version": "==1.0.0" "version": "==1.0.0"
}, },
"py": {
"hashes": [
"sha256:51c75c4126074b472f746a24399ad32f6053d1b34b68d2fa41e558e6f4a98719",
"sha256:607c53218732647dff4acdfcd50cb62615cedf612e72d1724fb1a0cc6405b378"
],
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
"version": "==1.11.0"
},
"pygments": { "pygments": {
"hashes": [ "hashes": [
"sha256:b3ed06a9e8ac9a9aae5a6f5dbe78a8a58655d17b43b93c078f094ddc476ae297", "sha256:b3ed06a9e8ac9a9aae5a6f5dbe78a8a58655d17b43b93c078f094ddc476ae297",
@@ -421,11 +401,11 @@
}, },
"pytest": { "pytest": {
"hashes": [ "hashes": [
"sha256:c7c6ca206e93355074ae32f7403e8ea12163b1163c976fee7d4d84027c162be5", "sha256:130328f552dcfac0b1cec75c12e3f005619dc5f874f0a06e8ff7263f0ee6225e",
"sha256:d45e0952f3727241918b8fd0f376f5ff6b301cc0777c6f9a556935c92d8a7d42" "sha256:c99ab0c73aceb050f68929bc93af19ab6db0558791c6a0715723abe9d0ade9d4"
], ],
"index": "pypi", "index": "pypi",
"version": "==7.2.1" "version": "==7.2.2"
}, },
"pytest-cov": { "pytest-cov": {
"hashes": [ "hashes": [
@@ -437,11 +417,11 @@
}, },
"pytest-html": { "pytest-html": {
"hashes": [ "hashes": [
"sha256:868c08564a68d8b2c26866f1e33178419bb35b1e127c33784a28622eb827f3f3", "sha256:1b8789f0f338f4d7c5cd49d94e9eb15df4c5fecc7e2a7bd07e7040fc4cf675d3",
"sha256:c4e2f4bb0bffc437f51ad2174a8a3e71df81bbc2f6894604e604af18fbe687c3" "sha256:f803a1f93106752575f51a89250af7e01d3470de13657207c6bb5409b5258a29"
], ],
"index": "pypi", "index": "pypi",
"version": "==3.2.0" "version": "==4.0.0rc0"
}, },
"pytest-metadata": { "pytest-metadata": {
"hashes": [ "hashes": [
@@ -459,14 +439,6 @@
"markers": "python_version >= '3.7' and python_version < '4'", "markers": "python_version >= '3.7' and python_version < '4'",
"version": "==2.28.2" "version": "==2.28.2"
}, },
"setuptools": {
"hashes": [
"sha256:e5fd0a713141a4a105412233c63dc4e17ba0090c8e8334594ac790ec97792330",
"sha256:f106dee1b506dee5102cc3f3e9e68137bbad6d47b616be7991714b0c62204251"
],
"markers": "python_version >= '3.7'",
"version": "==67.4.0"
},
"snowballstemmer": { "snowballstemmer": {
"hashes": [ "hashes": [
"sha256:09b16deb8547d3412ad7b590689584cd0fe25ec8db3be37788be3810cbf19cb1", "sha256:09b16deb8547d3412ad7b590689584cd0fe25ec8db3be37788be3810cbf19cb1",
@@ -516,11 +488,11 @@
}, },
"sphinxcontrib-jquery": { "sphinxcontrib-jquery": {
"hashes": [ "hashes": [
"sha256:8fb65f6dba84bf7bcd1aea1f02ab3955ac34611d838bcc95d4983b805b234daa", "sha256:1620739f04e36a2c779f1a131a2dfd49b2fd07351bf1968ced074365933abc7a",
"sha256:ed47fa425c338ffebe3c37e1cdb56e30eb806116b85f01055b158c7057fdb995" "sha256:f936030d7d0147dd026a4f2b5a57343d233f1fc7b363f68b3d4f1cb0993878ae"
], ],
"markers": "python_version >= '3.1'", "markers": "python_version >= '3.1'",
"version": "==2.0.0" "version": "==4.1"
}, },
"sphinxcontrib-jsmath": { "sphinxcontrib-jsmath": {
"hashes": [ "hashes": [
@@ -564,11 +536,11 @@
}, },
"urllib3": { "urllib3": {
"hashes": [ "hashes": [
"sha256:076907bf8fd355cde77728471316625a4d2f7e713c125f51953bb5b3eecf4f72", "sha256:8a388717b9476f934a21484e8c8e61875ab60644d29b9b39e11e4b9dc1c6b305",
"sha256:75edcdc2f7d85b137124a6c3c9fc3933cdeaa12ecb9a6a959f22797a0feca7e1" "sha256:aa751d169e23c7479ce47a0cb0da579e3ede798f994f5816a74e4f4500dcea42"
], ],
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4, 3.5'", "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4, 3.5'",
"version": "==1.26.14" "version": "==1.26.15"
}, },
"zipp": { "zipp": {
"hashes": [ "hashes": [

View File

@@ -75,6 +75,8 @@ class ETLController:
for analysis by using Transformer objects that have been registered with the controller. for analysis by using Transformer objects that have been registered with the controller.
""" """
posts_to_insert = []
def __init__(self): def __init__(self):
self.transformers = [] self.transformers = []
@@ -107,6 +109,29 @@ class ETLController:
self.session = sessionmaker() self.session = sessionmaker()
self.session.configure(bind=engine) self.session.configure(bind=engine)
# MAY4 can try adding some new functions for batching post inserts
def flush_posts(self, session):
    """Write every queued post to the database in one bulk call and reset the queue.

    ``session`` must provide ``bulk_save_objects``; the number of saved posts
    is logged at info level.
    """
    pending = self.posts_to_insert
    session.bulk_save_objects(pending)
    logger.info(f"Bulk saved {len(pending)} posts")
    # Rebind to a fresh list rather than clearing in place.
    self.posts_to_insert = []
def insert_post(self, obj, session, hydrate: bool = True, flush: bool = False):
    """Queue a post for bulk insertion, or insert it immediately.

    Args:
        obj: the object to insert.
        session: session used for the immediate insert and for flushing the queue.
        hydrate: when true, call ``obj.hydrate()`` first (skipped for ``Video`` objects).
        flush: when true, write all queued posts, then add and flush ``obj`` itself.

    Returns:
        ``obj`` when ``flush`` is true. Otherwise ``None``: the object is only
        appended to ``self.posts_to_insert`` and nothing has been written yet.
    """
    if hydrate and type(obj) != Video:
        obj.hydrate()

    if flush:
        # flush_posts takes the session as a required argument; calling it
        # without one raised TypeError before anything was written.
        self.flush_posts(session)
        session.add(obj)
        session.flush()
        logger.trace(f"Inserted new object {obj}")
        return obj

    self.posts_to_insert.append(obj)
    return None
def insert_or_select(self, obj, session, hydrate: bool = True): def insert_or_select(self, obj, session, hydrate: bool = True):
"""Inserts an object into the database or returns an existing object from the database. """Inserts an object into the database or returns an existing object from the database.
Regardless, the resulting object has an `id` attribute that can be referenced later.""" Regardless, the resulting object has an `id` attribute that can be referenced later."""
@@ -122,7 +147,7 @@ class ETLController:
(Channel.platform==obj.platform)).first() (Channel.platform==obj.platform)).first()
elif type(obj) == Post: elif type(obj) == Post:
instance = None return self.insert_post(obj, session, hydrate)
# instance = session.query(Post).filter_by(platform=obj.platform, platform_id=obj.platform_id).first() # instance = session.query(Post).filter_by(platform=obj.platform, platform_id=obj.platform_id).first()
elif issubclass(type(obj), Media): elif issubclass(type(obj), Media):

View File

@@ -107,8 +107,8 @@ class BitchuteTransformer(Transformer):
video_title = raw['subject'], video_title = raw['subject'],
video_duration = _parse_duration_str(raw['length'])) video_duration = _parse_duration_str(raw['length']))
# insert_post
transformed = insert(transformed) transformed = insert(transformed)
session.flush()
def parse_created(created: str, date_archived: datetime) -> datetime: def parse_created(created: str, date_archived: datetime) -> datetime:
"""Convert a created string (e.g. ``"1 year, 10 months ago"``) to a datetime """Convert a created string (e.g. ``"1 year, 10 months ago"``) to a datetime

View File

@@ -79,12 +79,9 @@ class RumbleTransformer(Transformer):
video_title = raw['title'], video_title = raw['title'],
video_duration=_parse_duration_str(raw['duration'])) video_duration=_parse_duration_str(raw['duration']))
# insert_post
insert(transformed) insert(transformed)
# media = self.process_media(raw, transformed.id, data)
# for m in media:
# insert(m)
def _process_number(s): def _process_number(s):
if s is None: if s is None:

View File

@@ -9,7 +9,7 @@ import time
from telethon.sync import TelegramClient from telethon.sync import TelegramClient
from telethon.errors.rpcerrorlist import ChannelPrivateError, ChannelInvalidError from telethon.errors.rpcerrorlist import ChannelPrivateError, ChannelInvalidError
from telethon.tl import types from telethon.tl import types
from telethon.helpers import add_surrogate from telethon.helpers import add_surrogate, del_surrogate
import os import os
from datetime import datetime, timezone from datetime import datetime, timezone
@@ -270,12 +270,23 @@ class TelegramTelethonTransformer(Transformer):
views = raw.get('views') views = raw.get('views')
) )
transformed = insert(transformed) # insert_post
insert(transformed)
def stripped(s):
    """Return the whitespace that ``s.strip()`` would remove: the leading run
    followed by the trailing run.

    Adapted from https://stackoverflow.com/a/29933716

    The trailing run is measured on what remains after the leading run, so an
    all-whitespace (or empty) string is returned once rather than doubled.
    """
    leading = s[:len(s) - len(s.lstrip())]
    remainder = s[len(leading):]
    trailing = remainder[len(remainder.rstrip()):]
    return leading + trailing
def add_markdown_links(raw_post): def add_markdown_links(raw_post):
"""This function is necessary because Telethon's markdown.unparse doesn't
correctly handle trailing whitespace or multi-line links"""
global_offset = 0 global_offset = 0
transformed_content = raw_post['message'] transformed_content = add_surrogate(raw_post['message'])
links = [entity for entity in raw_post['entities'] if entity['_'] == 'MessageEntityTextUrl'] links = [entity for entity in raw_post['entities'] if entity['_'] == 'MessageEntityTextUrl']
for link in links: for link in links:
@@ -284,12 +295,18 @@ def add_markdown_links(raw_post):
url = link['url'] url = link['url']
before_link = transformed_content[:offset] before_link = transformed_content[:offset]
link_text = f"[{transformed_content[offset:offset+length].strip()}]" inner_text = transformed_content[offset:offset+length]
trailing_whitespace = ''.join([c for c in transformed_content[offset:offset+length] if c.isspace()])
# skip creation of link if inner link text is only whitespace
if inner_text.replace('\u200b', '').strip():
processed_inner_text = inner_text.strip().replace('\n', '\\\n')
link_text = f"[{processed_inner_text}]"
trailing_whitespace = stripped(transformed_content[offset:offset+length])
link_href = f"({url})" link_href = f"({url})"
after_link = transformed_content[offset+length:] after_link = transformed_content[offset+length:]
transformed_content = before_link + link_text + link_href + trailing_whitespace + after_link transformed_content = before_link + link_text + link_href + trailing_whitespace + after_link
global_offset += (4 + len(url)) global_offset += (4 + len(url) + inner_text.strip().count('\n'))
return transformed_content return del_surrogate(transformed_content)

View File

@@ -133,8 +133,5 @@ class TwitterTransformer(Transformer):
if raw['quotedTweet'] is not None: if raw['quotedTweet'] is not None:
subtweet(raw['quotedTweet']) subtweet(raw['quotedTweet'])
#insert_post
insert(transformed) insert(transformed)
media = self.process_media(raw, transformed.id, data)
for m in media:
insert(m)

View File

@@ -66,6 +66,7 @@ class VkontakteTransformer(Transformer):
outlinks =list(filter(None, raw["outlinks"])) if raw['outlinks'] else [], outlinks =list(filter(None, raw["outlinks"])) if raw['outlinks'] else [],
) )
# insert_post
insert(transformed) insert(transformed)
# media = self.process_media(raw, transformed.id, data) # media = self.process_media(raw, transformed.id, data)

6
crontab Normal file
View File

@@ -0,0 +1,6 @@
# User-crontab format (no user column). Paths match the container image: the
# application is copied to /root and its dependencies live in the /.venv
# virtualenv; pipenv itself is not installed in the runtime stage.
# `cd ... &&` keeps a job from running in the wrong directory if the cd fails.
# Output goes to PID 1's stdout/stderr so it shows up in the container log.
# NOTE(review): `pipenv run` auto-loads a .env file; if the app relied on that,
# load .env inside app.py or export the variables before starting cron.
*/10 * * * * cd /root && /usr/bin/flock -w 0 scraper.lock /.venv/bin/python app.py scrape-channels > /proc/1/fd/1 2>/proc/1/fd/2
0 * * * * cd /root && /usr/bin/timeout -s 2 10800 /usr/bin/flock -w 0 transform.lock /.venv/bin/python app.py transform > /proc/1/fd/1 2>/proc/1/fd/2
0 21 * * * cd /root && /usr/bin/timeout -s 2 10800 /usr/bin/flock -w 0 scraper.lock /.venv/bin/python app.py channel-info > /proc/1/fd/1 2>/proc/1/fd/2
30 * * * * cd /root && /usr/bin/flock -w 0 transform.lock /.venv/bin/python app.py transform-info > /proc/1/fd/1 2>/proc/1/fd/2
0 * * * * cd /root && /usr/bin/flock -w 0 sync.lock /.venv/bin/python app.py sync-channels --gsheet https://docs.google.com/spreadsheets/d/1UnBxtRVkabKHkDUkLxtFOSIX9akytd-rZyV-H_K1PWg/edit > /proc/1/fd/1 2>/proc/1/fd/2
30 * * * * cd /root && /usr/bin/timeout -s 2 82800 /.venv/bin/python app.py transform-media > /proc/1/fd/1 2>/proc/1/fd/2