From dbc564e18b7add31ae929a7de9147f2993a5bc8e Mon Sep 17 00:00:00 2001 From: Patrick Robertson Date: Mon, 10 Feb 2025 22:58:52 +0000 Subject: [PATCH] Add sphinx_book_theme theme to poetry --- README.md | 45 ------------------ docs/source/core_modules.md | 2 +- docs/source/how_to.md | 47 +++++++++++++++++++ docs/source/index.md | 2 +- docs/source/user_guidelines.md | 12 ----- poetry.lock | 86 +++++++++++++++++++++++----------- pyproject.toml | 2 +- 7 files changed, 109 insertions(+), 87 deletions(-) create mode 100644 docs/source/how_to.md delete mode 100644 docs/source/user_guidelines.md diff --git a/README.md b/README.md index bffa9e0..27697e2 100644 --- a/README.md +++ b/README.md @@ -160,48 +160,3 @@ configurations: algorithm: "SHA-256" ``` -## Running on Google Sheets Feeder (gsheet_feeder) -The `--gsheet_feeder.sheet` property is the name of the Google Sheet to check for URLs. -This sheet must have been shared with the Google Service account used by `gspread`. -This sheet must also have specific columns (case-insensitive) in the `header` as specified in [gsheet_feeder.__manifest__.py](src/auto_archiver/modules/gsheet_feeder/__manifest__.py). 
The default names of these columns and their purpose is: - -Inputs: - -* **Link** *(required)*: the URL of the post to archive -* **Destination folder**: custom folder for archived file (regardless of storage) - -Outputs: -* **Archive status** *(required)*: Status of archive operation -* **Archive location**: URL of archived post -* **Archive date**: Date archived -* **Thumbnail**: Embeds a thumbnail for the post in the spreadsheet -* **Timestamp**: Timestamp of original post -* **Title**: Post title -* **Text**: Post text -* **Screenshot**: Link to screenshot of post -* **Hash**: Hash of archived HTML file (which contains hashes of post media) - for checksums/verification -* **Perceptual Hash**: Perceptual hashes of found images - these can be used for de-duplication of content -* **WACZ**: Link to a WACZ web archive of post -* **ReplayWebpage**: Link to a ReplayWebpage viewer of the WACZ archive - -For example, this is a spreadsheet configured with all of the columns for the auto archiver and a few URLs to archive. (Note that the column names are not case sensitive.) - -![A screenshot of a Google Spreadsheet with column headers defined as above, and several Youtube and Twitter URLs in the "Link" column](docs/demo-before.png) - -Now the auto archiver can be invoked, with this command in this example: `docker run --rm -v $PWD/secrets:/app/secrets -v $PWD/local_archive:/app/local_archive bellingcat/auto-archiver:dockerize --config secrets/orchestration-global.yaml --gsheet_feeder.sheet "Auto archive test 2023-2"`. Note that the sheet name has been overridden/specified in the command line invocation. - -When the auto archiver starts running, it updates the "Archive status" column. - -![A screenshot of a Google Spreadsheet with column headers defined as above, and several Youtube and Twitter URLs in the "Link" column. 
The auto archiver has added "archive in progress" to one of the status columns.](docs/demo-progress.png) - -The links are downloaded and archived, and the spreadsheet is updated to the following: - -![A screenshot of a Google Spreadsheet with videos archived and metadata added per the description of the columns above.](docs/demo-after.png) - -Note that the first row is skipped, as it is assumed to be a header row (`--gsheet_feeder.header=1` and you can change it if you use more rows above). Rows with an empty URL column, or a non-empty archive column are also skipped. All sheets in the document will be checked. - -The "archive location" link contains the path of the archived file, in local storage, S3, or in Google Drive. - -![The archive result for a link in the demo sheet.](docs/demo-archive.png) - ---- diff --git a/docs/source/core_modules.md b/docs/source/core_modules.md index 8fd548e..bb2b3f1 100644 --- a/docs/source/core_modules.md +++ b/docs/source/core_modules.md @@ -1,6 +1,6 @@ # Module Documentation -These pages describe the core modules that come with `auto-archiver` and provide the basic functionality for archiving websites on the internet. There are five core module types: +These pages describe the core modules that come with `auto-archiver` and provide the main functionality for archiving websites on the internet. There are five core module types: 1. Feeders - these 'feed' information (the URLs) from various sources to the `auto-archiver` for processing 2. Extractors - these 'extract' the page data for a given URL that is fed in by a feeder diff --git a/docs/source/how_to.md b/docs/source/how_to.md new file mode 100644 index 0000000..e8e5d9b --- /dev/null +++ b/docs/source/how_to.md @@ -0,0 +1,47 @@ +# How-To Guides + +## How to use Google Sheets to load and store archive information +The `--gsheet_feeder.sheet` property is the name of the Google Sheet to check for URLs. 
+This sheet must have been shared with the Google Service account used by `gspread`. +This sheet must also have specific columns (case-insensitive) in the `header` as specified in [gsheet_feeder.__manifest__.py](src/auto_archiver/modules/gsheet_feeder/__manifest__.py). The default names of these columns and their purposes are: + +Inputs: + +* **Link** *(required)*: the URL of the post to archive +* **Destination folder**: custom folder for archived file (regardless of storage) + +Outputs: +* **Archive status** *(required)*: Status of archive operation +* **Archive location**: URL of archived post +* **Archive date**: Date archived +* **Thumbnail**: Embeds a thumbnail for the post in the spreadsheet +* **Timestamp**: Timestamp of original post +* **Title**: Post title +* **Text**: Post text +* **Screenshot**: Link to screenshot of post +* **Hash**: Hash of archived HTML file (which contains hashes of post media) - for checksums/verification +* **Perceptual Hash**: Perceptual hashes of found images - these can be used for de-duplication of content +* **WACZ**: Link to a WACZ web archive of post +* **ReplayWebpage**: Link to a ReplayWebpage viewer of the WACZ archive + +For example, this is a spreadsheet configured with all of the columns for the auto archiver and a few URLs to archive. (Note that the column names are not case sensitive.) + +![A screenshot of a Google Spreadsheet with column headers defined as above, and several Youtube and Twitter URLs in the "Link" column](docs/demo-before.png) + +Now the auto archiver can be invoked, with this command in this example: `docker run --rm -v $PWD/secrets:/app/secrets -v $PWD/local_archive:/app/local_archive bellingcat/auto-archiver:dockerize --config secrets/orchestration-global.yaml --gsheet_feeder.sheet "Auto archive test 2023-2"`. Note that the sheet name has been overridden/specified in the command line invocation. + +When the auto archiver starts running, it updates the "Archive status" column. 
+ +![A screenshot of a Google Spreadsheet with column headers defined as above, and several Youtube and Twitter URLs in the "Link" column. The auto archiver has added "archive in progress" to one of the status columns.](docs/demo-progress.png) + +The links are downloaded and archived, and the spreadsheet is updated to the following: + +![A screenshot of a Google Spreadsheet with videos archived and metadata added per the description of the columns above.](docs/demo-after.png) + +Note that the first row is skipped, as it is assumed to be a header row (`--gsheet_feeder.header=1` and you can change it if you use more rows above). Rows with an empty URL column, or a non-empty archive column are also skipped. All sheets in the document will be checked. + +The "archive location" link contains the path of the archived file, in local storage, S3, or in Google Drive. + +![The archive result for a link in the demo sheet.](docs/demo-archive.png) + +--- diff --git a/docs/source/index.md b/docs/source/index.md index 0c64a13..bcab52a 100644 --- a/docs/source/index.md +++ b/docs/source/index.md @@ -8,7 +8,7 @@ :caption: Contents: Overview -user_guidelines +how_to installation/installation.rst core_modules.md development/developer_guidelines diff --git a/docs/source/user_guidelines.md b/docs/source/user_guidelines.md deleted file mode 100644 index 21f64a6..0000000 --- a/docs/source/user_guidelines.md +++ /dev/null @@ -1,12 +0,0 @@ - -# User Guidelines - - -This section of the documentation provides guidelines for users who want to use the tool, -without needing to modify the code. -To see the developer guidelines, see the [](development/developer_guidelines) - -```{note} -This is a work in progress. -``` - \ No newline at end of file diff --git a/poetry.lock b/poetry.lock index 8fb48ec..8748ea7 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,5 +1,24 @@ # This file is automatically @generated by Poetry 2.0.1 and should not be changed by hand. 
+[[package]] +name = "accessible-pygments" +version = "0.0.5" +description = "A collection of accessible pygments styles" +optional = false +python-versions = ">=3.9" +groups = ["docs"] +files = [ + {file = "accessible_pygments-0.0.5-py3-none-any.whl", hash = "sha256:88ae3211e68a1d0b011504b2ffc1691feafce124b845bd072ab6f9f66f34d4b7"}, + {file = "accessible_pygments-0.0.5.tar.gz", hash = "sha256:40918d3e6a2b619ad424cb91e556bd3bd8865443d9f22f1dcdf79e33c8046872"}, +] + +[package.dependencies] +pygments = ">=1.5" + +[package.extras] +dev = ["pillow", "pkginfo (>=1.10)", "playwright", "pre-commit", "setuptools", "twine (>=5.0)"] +tests = ["hypothesis", "pytest"] + [[package]] name = "alabaster" version = "1.0.0" @@ -722,24 +741,6 @@ future = "*" [package.extras] dev = ["Sphinx (==2.1.0)", "future (==0.17.1)", "numpy (==1.16.4)", "pytest (==4.6.1)", "pytest-mock (==1.10.4)", "tox (==3.12.1)"] -[[package]] -name = "furo" -version = "2024.8.6" -description = "A clean customisable Sphinx documentation theme." 
-optional = false -python-versions = ">=3.8" -groups = ["docs"] -files = [ - {file = "furo-2024.8.6-py3-none-any.whl", hash = "sha256:6cd97c58b47813d3619e63e9081169880fbe331f0ca883c871ff1f3f11814f5c"}, - {file = "furo-2024.8.6.tar.gz", hash = "sha256:b63e4cee8abfc3136d3bc03a3d45a76a850bada4d6374d24c1716b0e01394a01"}, -] - -[package.dependencies] -beautifulsoup4 = "*" -pygments = ">=2.7" -sphinx = ">=6.0,<9.0" -sphinx-basic-ng = ">=1.0.0.beta2" - [[package]] name = "future" version = "1.0.0" @@ -1653,6 +1654,34 @@ files = [ {file = "pycryptodomex-3.21.0.tar.gz", hash = "sha256:222d0bd05381dd25c32dd6065c071ebf084212ab79bab4599ba9e6a3e0009e6c"}, ] +[[package]] +name = "pydata-sphinx-theme" +version = "0.16.1" +description = "Bootstrap-based Sphinx theme from the PyData community" +optional = false +python-versions = ">=3.9" +groups = ["docs"] +files = [ + {file = "pydata_sphinx_theme-0.16.1-py3-none-any.whl", hash = "sha256:225331e8ac4b32682c18fcac5a57a6f717c4e632cea5dd0e247b55155faeccde"}, + {file = "pydata_sphinx_theme-0.16.1.tar.gz", hash = "sha256:a08b7f0b7f70387219dc659bff0893a7554d5eb39b59d3b8ef37b8401b7642d7"}, +] + +[package.dependencies] +accessible-pygments = "*" +Babel = "*" +beautifulsoup4 = "*" +docutils = "!=0.17.0" +pygments = ">=2.7" +sphinx = ">=6.1" +typing-extensions = "*" + +[package.extras] +a11y = ["pytest-playwright"] +dev = ["pandoc", "pre-commit", "pydata-sphinx-theme[doc,test]", "pyyaml", "sphinx-theme-builder[cli]", "tox"] +doc = ["ablog (>=0.11.8)", "colorama", "graphviz", "ipykernel", "ipyleaflet", "ipywidgets", "jupyter_sphinx", "jupyterlite-sphinx", "linkify-it-py", "matplotlib", "myst-parser", "nbsphinx", "numpy", "numpydoc", "pandas", "plotly", "rich", "sphinx-autoapi (>=3.0.0)", "sphinx-copybutton", "sphinx-design", "sphinx-favicon (>=1.0.1)", "sphinx-sitemap", "sphinx-togglebutton", "sphinxcontrib-youtube (>=1.4.1)", "sphinxext-rediraffe", "xarray"] +i18n = ["Babel", "jinja2"] +test = ["pytest", "pytest-cov", "pytest-regressions", 
"sphinx[test]"] + [[package]] name = "pygments" version = "2.19.1" @@ -2359,22 +2388,25 @@ websockets = ">=11" test = ["httpx", "pytest (>=6)"] [[package]] -name = "sphinx-basic-ng" -version = "1.0.0b2" -description = "A modern skeleton for Sphinx themes." +name = "sphinx-book-theme" +version = "1.1.3" +description = "A clean book theme for scientific explanations and documentation with Sphinx" optional = false -python-versions = ">=3.7" +python-versions = ">=3.9" groups = ["docs"] files = [ - {file = "sphinx_basic_ng-1.0.0b2-py3-none-any.whl", hash = "sha256:eb09aedbabfb650607e9b4b68c9d240b90b1e1be221d6ad71d61c52e29f7932b"}, - {file = "sphinx_basic_ng-1.0.0b2.tar.gz", hash = "sha256:9ec55a47c90c8c002b5960c57492ec3021f5193cb26cebc2dc4ea226848651c9"}, + {file = "sphinx_book_theme-1.1.3-py3-none-any.whl", hash = "sha256:a554a9a7ac3881979a87a2b10f633aa2a5706e72218a10f71be38b3c9e831ae9"}, + {file = "sphinx_book_theme-1.1.3.tar.gz", hash = "sha256:1f25483b1846cb3d353a6bc61b3b45b031f4acf845665d7da90e01ae0aef5b4d"}, ] [package.dependencies] -sphinx = ">=4.0" +pydata-sphinx-theme = ">=0.15.2" +sphinx = ">=5" [package.extras] -docs = ["furo", "ipython", "myst-parser", "sphinx-copybutton", "sphinx-inline-tabs"] +code-style = ["pre-commit"] +doc = ["ablog", "folium", "ipywidgets", "matplotlib", "myst-nb", "nbclient", "numpy", "numpydoc", "pandas", "plotly", "sphinx-copybutton", "sphinx-design", "sphinx-examples", "sphinx-tabs", "sphinx-thebe", "sphinx-togglebutton", "sphinxcontrib-bibtex", "sphinxcontrib-youtube", "sphinxext-opengraph"] +test = ["beautifulsoup4", "coverage", "defusedxml", "myst-nb", "pytest", "pytest-cov", "pytest-regressions", "sphinx_thebe"] [[package]] name = "sphinx-copybutton" @@ -3100,4 +3132,4 @@ test = ["pytest (>=8.1,<9.0)", "pytest-rerunfailures (>=14.0,<15.0)"] [metadata] lock-version = "2.1" python-versions = ">=3.10,<3.13" -content-hash = "9ca114395e73af8982abbccc25b385bbca62e50ba7cca8239e52e5c1227cb4b0" +content-hash = 
"432fe98be0e17791a047396646177cb8aaf6590c6d6247829664ed6fc1f84428" diff --git a/pyproject.toml b/pyproject.toml index f1be273..f025842 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -72,7 +72,7 @@ sphinxcontrib-mermaid = "^1.0.0" sphinx-autobuild = "^2024.10.3" sphinx-copybutton = "^0.5.2" myst-parser = "^4.0.0" -furo = "^2024.8.6" +sphinx-book-theme = "^1.1.3" [project.scripts]