11 Commits

Author SHA1 Message Date
Felix Spöttel
3cc3c30e03 Bump version to v0.2.1 for release 2022-07-01 18:52:00 +02:00
Felix Spöttel
c9d36c6bf3 feat: print success output 2022-07-01 18:51:25 +02:00
Felix Spöttel
62da0806c7 fix: debug prints 2022-07-01 18:48:17 +02:00
Felix Spöttel
8657bd73ec Bump version to v0.2.0 for release 2022-07-01 18:15:07 +02:00
Felix Spöttel
e633665813 chore: update license 2022-07-01 18:12:00 +02:00
Felix Spöttel
cff5256d06 feat: add --debug flag, improve logging & help
closes #9
2022-07-01 17:53:09 +02:00
Felix Spöttel
4dfa08bbbc feat: add --open flag (#11)
closes #5
2022-07-01 17:08:53 +02:00
Felix Spöttel
eaa4022b70 ci: use pipfile.lock as cache key 2022-07-01 17:05:43 +02:00
Felix Spöttel
1cb5541baa chore: remove clustering print 2022-07-01 17:04:56 +02:00
Felix Spöttel
b40074317c feat: extend kepler.gl color range
closes #10
2022-07-01 17:04:33 +02:00
Miguel Sozinho Ramalho
f1053953ba feat: auto-deploy to pypi (#8) 2022-07-01 15:23:50 +01:00
18 changed files with 2502 additions and 1094 deletions

53
.github/actions/setup-venv/action.yml vendored Normal file
View File

@@ -0,0 +1,53 @@
# Composite action: installs the requested Python interpreter, then restores
# (or builds) a `.venv` virtual environment with the project installed in
# editable mode. The environment is cached per OS, exact Python version and
# Pipfile.lock contents.
name: Python virtualenv
description: Set up a Python virtual environment with caching
inputs:
  python-version:
    description: The Python version to use
    required: true
  cache-prefix:
    description: Update this to invalidate the cache
    required: true
    default: v0
runs:
  using: composite
  steps:
    - name: Setup Python
      uses: actions/setup-python@v4
      with:
        python-version: ${{ inputs.python-version }}
    - shell: bash
      run: |
        # Install prerequisites.
        pip install --upgrade pip setuptools wheel virtualenv
    - shell: bash
      run: |
        # Get the exact Python version to use in the cache key.
        echo "PYTHON_VERSION=$(python --version)" >> "$GITHUB_ENV"
    # actions/cache v2 has been retired by GitHub; v3 takes the same inputs
    # and exposes the same `cache-hit` output used by the steps below.
    - uses: actions/cache@v3
      id: virtualenv-cache
      with:
        path: .venv
        key: ${{ inputs.cache-prefix }}-${{ runner.os }}-${{ env.PYTHON_VERSION }}-${{ hashFiles('Pipfile.lock') }}
    - if: steps.virtualenv-cache.outputs.cache-hit != 'true'
      shell: bash
      run: |
        # Set up virtual environment without cache hit.
        test -d .venv || virtualenv -p "$(which python)" --copies --reset-app-data .venv
        . .venv/bin/activate
        # Quoted so the shell cannot treat `[dev]` as a glob character class.
        pip install -e '.[dev]'
    - if: steps.virtualenv-cache.outputs.cache-hit == 'true'
      shell: bash
      run: |
        # Set up virtual environment from cache hit.
        . .venv/bin/activate
        # Dependencies come from the cache; only the project itself is
        # (re)installed in editable mode.
        pip install --no-deps -e '.[dev]'
    - shell: bash
      run: |
        # Show environment info.
        . .venv/bin/activate
        echo "✓ Installed $(python --version) virtual environment to $(which python)"

View File

@@ -1,10 +0,0 @@
name: Lint
on: [push]
jobs:
black:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
- uses: psf/black@stable

113
.github/workflows/main.yml vendored Normal file
View File

@@ -0,0 +1,113 @@
# CI pipeline: builds and style-checks the package for pull requests and
# pushes to main, and for `v*.*.*` tags additionally publishes the built
# distribution to PyPI and to a GitHub release.
name: Main
concurrency:
  # One active run per workflow and ref; a newer run cancels the older one.
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true
# on: [push]
on:
  pull_request:
    branches:
      - main
  push:
    branches:
      - main
    tags:
      - "v*.*.*"
env:
  # Change this to invalidate existing cache.
  CACHE_PREFIX: v0
  PYTHONPATH: ./
jobs:
  checks:
    name: Python ${{ matrix.python }} - ${{ matrix.task.name }}
    runs-on: [ubuntu-latest]
    timeout-minutes: 15
    strategy:
      fail-fast: false
      matrix:
        # Each entry is one independent check; `task.run` is the script
        # executed inside the virtual environment.
        include:
          - python: "3.10"
            task:
              name: "Build"
              run: |
                python setup.py check
                python setup.py bdist_wheel sdist
          - python: "3.10"
            task:
              name: "Style"
              run: |
                black --check .
    steps:
      - uses: actions/checkout@v3
      - name: Setup Python environment
        uses: ./.github/actions/setup-venv
        with:
          python-version: ${{ matrix.python }}
          cache-prefix: ${{ env.CACHE_PREFIX }}
      - name: ${{ matrix.task.name }}
        run: |
          . .venv/bin/activate
          ${{ matrix.task.run }}
      # NOTE(review): the v3 artifact actions are deprecated on github.com;
      # when upgrading, bump upload-artifact and download-artifact together.
      - name: Upload package distribution files
        if: matrix.task.name == 'Build'
        uses: actions/upload-artifact@v3
        with:
          name: package
          path: dist
      - name: Clean up
        if: always()
        run: |
          . .venv/bin/activate
          pip uninstall -y geoclustering
  release:
    name: Release
    runs-on: ubuntu-latest
    needs: [checks]
    # Only tag pushes publish a release.
    if: startsWith(github.ref, 'refs/tags/')
    steps:
      # Same checkout version as the `checks` job (was the outdated v1).
      - uses: actions/checkout@v3
      - name: Setup Python
        uses: actions/setup-python@v4
        with:
          python-version: "3.10"
      - name: Install requirements
        run: |
          pip install --upgrade pip setuptools wheel "twine>=1.11.0"
      - name: Prepare environment
        run: |
          echo "RELEASE_VERSION=${GITHUB_REF#refs/tags/v}" >> "$GITHUB_ENV"
          echo "TAG=${GITHUB_REF#refs/tags/}" >> "$GITHUB_ENV"
      - name: Download package distribution files
        uses: actions/download-artifact@v3
        with:
          name: package
          path: dist
      - name: Publish package to PyPI
        # Credentials are handed to twine through its environment variables
        # instead of being pasted into the script text, so special characters
        # in a secret cannot break or alter the command line.
        env:
          TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }}
          TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }}
        run: |
          twine upload dist/*
      - name: Publish GitHub release
        uses: softprops/action-gh-release@v1
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        with:
          # body_path: ${{ github.workspace }}-RELEASE_NOTES.md
          # Tags containing "rc" are published as pre-releases.
          prerelease: ${{ contains(env.TAG, 'rc') }}
          files: |
            dist/*

View File

@@ -1,6 +1,6 @@
MIT License MIT License
Copyright (c) 2022, Felix Spöttel Copyright (c) 2022, Stichting Bellingcat
Permission is hereby granted, free of charge, to any person obtaining a copy Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal of this software and associated documentation files (the "Software"), to deal

19
Pipfile Normal file
View File

@@ -0,0 +1,19 @@
# Pipenv manifest: declares the package index, the runtime and development
# dependencies, and the Python version the environment is resolved for.
[[source]]
url = "https://pypi.org/simple"
verify_ssl = true
name = "pypi"
# Runtime dependencies. "*" places no version constraint here; presumably the
# exact versions are pinned by the generated Pipfile.lock.
[packages]
click = "*"
geojson = "*"
keplergl = "*"
numpy = "*"
pandas = "*"
scikit-learn = "*"
# Tools needed only for development and packaging.
[dev-packages]
black = "*"
wheel = "*"
# NOTE(review): the CI workflow runs its jobs on Python "3.10" while this
# requires 3.9 - confirm which version is intended.
[requires]
python_version = "3.9"

1124
Pipfile.lock generated Normal file

File diff suppressed because it is too large Load Diff

View File

@@ -1,4 +1,4 @@
# geocluster # geoclustering
> 📍 command-line tool for clustering geolocations. > 📍 command-line tool for clustering geolocations.
@@ -18,8 +18,8 @@ A cluster is created when a certain number of points (=> `--size`) each are with
Clone the repository: Clone the repository:
```sh ```sh
git clone https://github.com/fspoettel/geocluster git clone https://github.com/bellingcat/geoclustering
cd geocluster cd geoclustering
``` ```
Install keplergl build dependencies: Install keplergl build dependencies:
@@ -37,7 +37,13 @@ pip install .
## Usage ## Usage
``` ```
Usage: geocluster [OPTIONS] FILENAME Usage: geoclustering [OPTIONS] FILENAME
Tool to cluster geolocations. A cluster is created when a certain number of
points (--size) each are within a given distance (--distance) of at least
one other point in the cluster. Input is supplied as a csv file. At a
minimum, each row needs to have a 'lat' and a 'lon' column. Other rows are
reflected to the output.
Options: Options:
-d, --distance FLOAT (in km) Max. distance between two points in -d, --distance FLOAT (in km) Max. distance between two points in
@@ -50,6 +56,9 @@ Options:
Clustering algorithm to be used. `optics` Clustering algorithm to be used. `optics`
produces tighter clusters but is slower. produces tighter clusters but is slower.
Default: dbscan Default: dbscan
--open Open the generated visualization in the
default browser automatically.
--debug Print debug output.
--help Show this message and exit. --help Show this message and exit.
``` ```

View File

@@ -1,12 +1,16 @@
from pathlib import Path
import click import click
import os
import webbrowser import webbrowser
import geocluster.clustering as clustering import geoclustering.clustering as clustering
import geocluster.encoding as encoding import geoclustering.encoding as encoding
import geocluster.io as io import geoclustering.io as io
@click.command() @click.command(
help="Tool to cluster geolocations. A cluster is created when a certain number of points (--size) each are within a given distance (--distance) of at least one other point in the cluster. Input is supplied as a csv file. At a minimum, each row needs to have a 'lat' and a 'lon' column. Other rows are reflected to the output."
)
@click.option( @click.option(
"--distance", "--distance",
"-d", "-d",
@@ -38,26 +42,44 @@ import geocluster.io as io
default="dbscan", default="dbscan",
help="Clustering algorithm to be used. `optics` produces tighter clusters but is slower. Default: dbscan", help="Clustering algorithm to be used. `optics` produces tighter clusters but is slower. Default: dbscan",
) )
@click.option(
"--open",
is_flag=True,
help="Open the generated visualization in the default browser automatically.",
)
@click.option("--debug", is_flag=True, help="Print debug output.")
@click.argument("filename", type=click.Path(exists=True)) @click.argument("filename", type=click.Path(exists=True))
def main(distance, size, output, filename, algorithm): def main(distance, size, output, filename, algorithm, open, debug):
def print_debug(s):
if debug:
click.secho(s, fg="bright_black")
df = io.read_csv_file(filename) df = io.read_csv_file(filename)
print_debug(f"Read {len(df)} valid coordinates from {Path(filename).absolute()}")
clusters = clustering.cluster_locations( clusters = clustering.cluster_locations(
df=df, algorithm=algorithm, radius_km=distance, min_cluster_size=size df=df, algorithm=algorithm, radius_km=distance, min_cluster_size=size
) )
if not bool(clusters): if not bool(clusters):
click.echo("Did not find clusters matching input parameters.") click.secho("Did not find clusters matching input parameters.", fg="yellow")
return return
print_debug(f"Found {len(clusters)} valid clusters using {algorithm}")
encoded = encoding.encode_clusters(clusters) encoded = encoding.encode_clusters(clusters)
io.write_output_file(output, "result.txt", encoded["string"]) io.write_output_file(output, "result.txt", encoded["string"])
io.write_output_file(output, "result.json", encoded["json"]) io.write_output_file(output, "result.json", encoded["json"])
io.write_output_file(output, "result.geojson", encoded["geojson"]) io.write_output_file(output, "result.geojson", encoded["geojson"])
vis = io.write_visualization(output, "result.html", encoded["geojson"]) vis = io.write_visualization(output, "result.html", encoded["geojson"])
click.echo(f"Output files saved to {Path(output).absolute()}")
webbrowser.open_new_tab("file://" + str(vis.absolute())) if open:
print_debug(f"Opening visualization in default browser")
webbrowser.open_new_tab("file://" + str(vis.absolute()))
click.secho("Clustering completed.", fg="green")
if __name__ == "__main__": if __name__ == "__main__":

View File

@@ -14,8 +14,6 @@ def to_cluster_dict(df, clustering):
""" """
clusters_by_id = {} clusters_by_id = {}
print(clustering.labels_)
for idx, cluster_id in enumerate(clustering.labels_): for idx, cluster_id in enumerate(clustering.labels_):
# ignore "noise" locations that don't belong to any cluster. # ignore "noise" locations that don't belong to any cluster.
if cluster_id > -1: if cluster_id > -1:

View File

@@ -2,9 +2,22 @@ from keplergl import KeplerGl
from pathlib import Path from pathlib import Path
from pkg_resources import resource_filename from pkg_resources import resource_filename
import json import json
import json
import pandas as pd import pandas as pd
import numpy as np import numpy as np
import os
import sys
class HiddenPrints:
    """Context manager that silences ``sys.stdout`` inside a ``with`` block.

    On entry the current ``sys.stdout`` is remembered and replaced by a
    writable handle on ``os.devnull``; on exit that handle is closed and the
    remembered stream is put back. Exceptions raised inside the block are not
    suppressed.
    """

    def __enter__(self):
        # Remember the active stream first so it can be reinstated on exit.
        self._original_stdout = sys.stdout
        devnull_stream = open(os.devnull, "w")
        sys.stdout = devnull_stream

    def __exit__(self, exc_type, exc_val, exc_tb):
        # Close whatever is currently installed as stdout (the devnull handle
        # from __enter__), then restore the saved stream.
        muted_stream = sys.stdout
        muted_stream.close()
        sys.stdout = self._original_stdout
def is_valid_lat(val: str) -> bool: def is_valid_lat(val: str) -> bool:
@@ -64,15 +77,21 @@ def write_output_file(dirname, filename, data):
def write_visualization(dirname, filename, data): def write_visualization(dirname, filename, data):
"""Write a visualization, ensuring parent directories.""" """Write a visualization, ensuring parent directories."""
map = KeplerGl() # Hide kepler stdout output.
with HiddenPrints():
map = KeplerGl()
map.add_data(data=data, name="clusters") map.add_data(data=data, name="clusters")
# config configures a default color scheme for our clusters layer. # config configures a default color scheme for our clusters layer.
config_file = resource_filename("geocluster", "kepler_config.json") config_file = resource_filename("geoclustering", "kepler_config.json")
with open(config_file) as f: with open(config_file) as f:
map.config = json.loads(f.read()) map.config = json.loads(f.read())
filepath = ensure_file_path(dirname, filename) filepath = ensure_file_path(dirname, filename)
map.save_to_html(file_name=str(filepath), center_map=True)
# Hide kepler stdout output.
with HiddenPrints():
map.save_to_html(file_name=str(filepath), center_map=True)
return filepath return filepath

View File

@@ -9,7 +9,7 @@
"config": { "config": {
"dataId": "clusters", "dataId": "clusters",
"label": "clusters", "label": "clusters",
"color": [179, 173, 158], "color": [248, 149, 112],
"highlightColor": [252, 242, 26, 255], "highlightColor": [252, 242, 26, 255],
"columns": { "geojson": "_geojson" }, "columns": { "geojson": "_geojson" },
"isVisible": true, "isVisible": true,
@@ -19,16 +19,30 @@
"thickness": 0.5, "thickness": 0.5,
"strokeColor": null, "strokeColor": null,
"colorRange": { "colorRange": {
"name": "Global Warming", "name": "Uber Viz Qualitative 4",
"type": "sequential", "type": "qualitative",
"category": "Uber", "category": "Uber",
"colors": [ "colors": [
"#5A1846", "#12939A",
"#900C3F", "#DDB27C",
"#C70039", "#88572C",
"#E3611C", "#FF991F",
"#F1920E", "#F15C17",
"#FFC300" "#223F9A",
"#DA70BF",
"#125C77",
"#4DC19C",
"#776E57",
"#17B8BE",
"#F6D18A",
"#B7885E",
"#FFCB99",
"#F89570",
"#829AE3",
"#E79FD5",
"#1E96BE",
"#89DAC1",
"#B3AD9E"
] ]
}, },
"strokeColorRange": { "strokeColorRange": {

11
geoclustering/version.py Normal file
View File

@@ -0,0 +1,11 @@
# Individual components of the package version; the public VERSION and
# VERSION_SHORT strings below are assembled from them.
_MAJOR = "0"
_MINOR = "2"
# On main and in a nightly release the patch should be one ahead of the last
# released build.
_PATCH = "1"
# This is mainly for nightly builds which have the suffix ".dev$DATE". See
# https://semver.org/#is-v123-a-semantic-version for the semantics.
_SUFFIX = ""

# "<major>.<minor>"
VERSION_SHORT = ".".join((_MAJOR, _MINOR))
# "<major>.<minor>.<patch>" followed directly by the optional suffix.
VERSION = ".".join((_MAJOR, _MINOR, _PATCH)) + _SUFFIX

18
scripts/release.sh Executable file
View File

@@ -0,0 +1,18 @@
#!/bin/bash
# Cuts a release: commits any pending changes as a version bump, pushes them,
# then creates and pushes an annotated git tag named after the version defined
# in geoclustering/version.py. Exits with status 1 if the user declines.
set -e

TAG=$(python -c 'from geoclustering.version import VERSION; print("v" + VERSION)')

# The hint reads [y/N] because only an explicit yes continues; any other
# answer, including just pressing Enter, cancels. -r keeps backslashes in the
# answer literal.
read -r -p "Creating new release for $TAG. Do you want to continue? [y/N] " prompt

if [[ $prompt == "y" || $prompt == "Y" || $prompt == "yes" || $prompt == "Yes" ]]; then
    git add -A
    # A failing commit (e.g. nothing to commit) must not abort the release
    # under `set -e`; the push still runs afterwards.
    git commit -m "Bump version to $TAG for release" || true
    git push
    echo "Creating new git tag $TAG"
    git tag "$TAG" -m "$TAG"
    git push --tags
else
    echo "Cancelled"
    exit 1
fi

View File

@@ -1,12 +1,30 @@
from setuptools import setup from setuptools import setup
# version.py defines the VERSION and VERSION_SHORT variables.
# We use exec here so we don't import cached_path whilst setting up.
VERSION = {} # type: ignore
with open("geoclustering/version.py", "r") as version_file:
exec(version_file.read(), VERSION)
setup( setup(
name="geocluster", name="geoclustering",
version="0.1", version=VERSION["VERSION"],
description="", description="📍 command-line tool for clustering geolocations.",
long_description=open("README.md").read(),
long_description_content_type="text/markdown",
classifiers=[
"Intended Audience :: Developers",
"Intended Audience :: Science/Research",
"License :: OSI Approved :: MIT License",
"Programming Language :: Python :: 3",
],
author="Bellingcat", author="Bellingcat",
packages=["geocluster"], author_email="tech@bellingcat.com",
entry_points={"console_scripts": ["geocluster = geocluster.__main__:main"]}, license="MIT",
packages=["geoclustering"],
package_data={"geoclustering": ["kepler_config.json"]},
keywords=["cluster", "gis", "pattern-analysis"],
entry_points={"console_scripts": ["geoclustering = geoclustering.__main__:main"]},
install_requires=[ install_requires=[
"click", "click",
"geojson", "geojson",