feat: prototype

2026-06-07 19:18:30 +03:00 · 2022-06-29 18:14:23 +02:00
parent 60e2ff68cc
commit 50f8a872e6
15 changed files with 16765 additions and 0 deletions
--- a/.editorconfig
+++ b/.editorconfig
@@ -0,0 +1,24 @@
+# EditorConfig is awesome: https://EditorConfig.org
+
+# top-most EditorConfig file
+root = true
+
+# Unix-style newlines with a newline ending every file
+[*]
+charset = utf-8
+end_of_line = lf
+insert_final_newline = true
+trim_trailing_whitespace = true
+
+# 2 space indentation for every file
+[*]
+indent_style = space
+indent_size = 2
+
+# 4 space indentation for python
+[*.py]
+indent_size = 4
+
+# allow trailing whitespace in markdown files
+[*.md]
+trim_trailing_whitespace = false
--- a/.github/workflows/lint.yml
+++ b/.github/workflows/lint.yml
@@ -0,0 +1,10 @@
+name: Lint
+
+on: [push]
+
+jobs:
+  black:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v3
+      - uses: psf/black@stable
--- a/.gitignore
+++ b/.gitignore
@@ -0,0 +1,166 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+.pybuilder/
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+#   For a library or package, you might want to ignore these files since the code is
+#   intended to run in multiple environments; otherwise, check them in:
+# .python-version
+
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+
+# poetry
+#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+#   This is especially recommended for binary packages to ensure reproducibility, and is more
+#   commonly ignored for libraries.
+#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+
+# pdm
+#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+#   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+#   in version control.
+#   https://pdm.fming.dev/#use-with-ide
+.pdm.toml
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# pytype static type analyzer
+.pytype/
+
+# Cython debug symbols
+cython_debug/
+
+# PyCharm
+#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+#  and can be added to the global gitignore or merged into this file.  For a more nuclear
+#  option (not recommended) you can uncomment the following to ignore the entire idea folder.
+#.idea/
+
+# VSCode files
+.vscode
+
+# output directory
+output/
--- a/21
+++ b/21
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2022, Felix Spöttel
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
--- a/README.md
+++ b/README.md
@@ -0,0 +1,130 @@
+# geocluster
+
+> 📍 command-line tool for clustering geolocations.
+
+### Features
+
+ - Uses [DBSCAN](https://scikit-learn.org/stable/modules/generated/sklearn.cluster.DBSCAN.html) or [OPTICS](https://scikit-learn.org/stable/modules/generated/sklearn.cluster.OPTICS.html) to perform clustering.
+ - Outputs clustering results as `json`, `txt` and `geojson`.
+ - Creates a [kepler.gl](https://kepler.gl) visualization of clusters.
+
+### Clustering Method
+
+A cluster is created when at least a given number of points (=> `--size`) are each within a given distance (=> `--distance`) of at least one other point in the cluster.
+
+
+## Install
+
+Clone the repository:
+
+```sh
+git clone https://github.com/fspoettel/geocluster
+cd geocluster
+```
+
+Install keplergl build dependencies:
+
+```sh
+# macos
+brew install proj gdal
+```
+
+Install project with pip:
+```sh
+pip install .
+```
+
+## Usage
+
+```
+Usage: geocluster [OPTIONS] FILENAME
+
+Options:
+  -d, --distance FLOAT            (in km) Max. distance between two points in
+                                  a cluster.  [required]
+  -s, --size INTEGER              Min. number of points in a cluster.
+                                  [required]
+  -o, --output PATH               Output directory for results. Default:
+                                  ./output
+  -a, --algorithm [dbscan|optics]
+                                  Clustering algorithm to be used. `optics`
+                                  produces tighter clusters but is slower.
+                                  Default: dbscan
+  --help                          Show this message and exit.
+```
+
+## Input
+
+Input is supplied as a `.csv` file. The only required fields are `lat` and `lon`; all other fields are passed through to the output.
+
+```csv
+id,name,lat,lon
+1,Bonnibelle Mathwen,40.1324085,64.4911086
+...
+```
+
+## Output
+
+If at least one cluster was found, the tool writes a folder containing `json`, `geojson` and `txt` files as well as a kepler.gl `html` file.
+
+### JSON
+
+Encodes an array of clusters, each containing an array of points.
+
+```json
+[
+  {
+    "cluster_id": 0,
+    "points": [
+      {
+        "id": 9,
+        "name": "Rosanna Foggo",
+        "lat": -6.2074293,
+        "lon": 106.8915948
+      }
+    ]
+  }
+]
+```
+
+### GeoJSON
+
+Encodes a single `FeatureCollection`, containing all points as `Feature` objects.
+
+```json
+{
+  "type": "FeatureCollection",
+  "features": [
+    {
+      "type": "Feature",
+      "geometry": {
+        "type": "Point",
+        "coordinates": [
+          106.891595,
+          -6.207429
+        ]
+      },
+      "properties": {
+        "id": 9,
+        "name": "Rosanna Foggo",
+        "cluster_id": 0
+      }
+    }
+  ]
+}
+```
+
+### txt
+
+Encodes clusters as blocks separated by an empty line, where each line in a cluster block contains one point.
+
+```txt
+Cluster 0
+id 9, name Rosanna Foggo, lat -6.2074293, lon 106.8915948
+
+// ...
+```
+
+### kepler.gl
+
+![kepler.gl instance](https://user-images.githubusercontent.com/1682504/176478177-c0446b51-4060-495c-803d-79e2bbd3e966.png)
--- a/geocluster/init.py
+++ b/geocluster/init.py
--- a/geocluster/cli.py
+++ b/geocluster/cli.py
@@ -0,0 +1,64 @@
+import click
+import webbrowser
+
+import geocluster.clustering as clustering
+import geocluster.encoding as encoding
+import geocluster.io as io
+
+
+@click.command()
+@click.option(
+    "--distance",
+    "-d",
+    type=click.FLOAT,
+    required=True,
+    help="(in km) Max. distance between two points in a cluster.",
+)
+@click.option(
+    "--size",
+    "-s",
+    type=click.INT,
+    required=True,
+    help="Min. number of points in a cluster.",
+)
+@click.option(
+    "--output",
+    "-o",
+    type=click.Path(exists=False),
+    default="output",
+    help="Output directory for results. Default: ./output",
+)
+@click.option(
+    "--algorithm",
+    "-a",
+    type=click.Choice(
+        ["dbscan", "optics"],
+        case_sensitive=False,
+    ),
+    default="dbscan",
+    help="Clustering algorithm to be used. `optics` produces tighter clusters but is slower. Default: dbscan",
+)
+@click.argument("filename", type=click.Path(exists=True))
+def main(distance, size, output, filename, algorithm):
+    # NOTE: documented with comments instead of a docstring on purpose; click would
+    # print a docstring as part of `--help`, changing the CLI's output.
+    #
+    # Reads locations from `filename`, clusters them and writes the txt / json /
+    # geojson results plus a kepler.gl html map into the `output` directory.
+    df = io.read_csv_file(filename)
+
+    clusters = clustering.cluster_locations(
+        df=df, algorithm=algorithm, radius_km=distance, min_cluster_size=size
+    )
+
+    # an empty result means no cluster was found, so there is nothing to write.
+    if not clusters:
+        click.echo("Did not find clusters matching input parameters.")
+        return
+
+    encoded = encoding.encode_clusters(clusters)
+
+    io.write_output_file(output, "result.txt", encoded["string"])
+    io.write_output_file(output, "result.json", encoded["json"])
+    io.write_output_file(output, "result.geojson", encoded["geojson"])
+    vis = io.write_visualization(output, "result.html", encoded["geojson"])
+
+    # `Path.as_uri()` builds a well-formed, percent-encoded `file:` URL; gluing
+    # "file://" onto the path breaks for Windows drive paths and for paths that
+    # contain spaces or other reserved characters.
+    webbrowser.open_new_tab(vis.absolute().as_uri())
+
+
+# Invoke the click command when this module is executed as a script.
+if __name__ == "__main__":
+    main()
--- a/geocluster/clustering.py
+++ b/geocluster/clustering.py
@@ -0,0 +1,54 @@
+from sklearn.cluster import DBSCAN, OPTICS
+import numpy as np
+
+
+def km_to_radians(km, earth_radius_km=6378.1):
+    """
+    Convert a distance in kilometers to the angle (in radians) it spans on a sphere.
+
+    `earth_radius_km` is the radius of that sphere. It defaults to 6378.1, the value
+    this function has always used, so existing callers get unchanged results; pass
+    a different radius (e.g. a mean Earth radius) to change the approximation.
+    """
+    return km / earth_radius_km
+
+
+def to_cluster_dict(df, clustering):
+    """
+    Creates a dict <cluster_id, list[dict]>.
+    Each key corresponds to a cluster_id and holds a list of matching location data as dict.
+
+    `clustering.labels_` is read positionally: the label at position `i` belongs to
+    the row `df.iloc[i]`. Rows with a negative label are left out of the result.
+    """
+    clusters_by_id = {}
+
+    for idx, cluster_id in enumerate(clustering.labels_):
+        # ignore "noise" locations that don't belong to any cluster.
+        if cluster_id > -1:
+            row = df.iloc[idx]
+            clusters_by_id.setdefault(cluster_id, []).append(row.to_dict())
+
+    return clusters_by_id
+
+
+def cluster_locations(df, algorithm, radius_km, min_cluster_size):
+    """
+    Groups the locations of a dataframe into clusters.
+
+    The `lat` / `lon` columns of `df` are converted to radians and fitted with
+    DBSCAN when `algorithm` is "dbscan", and with OPTICS for any other value.
+    `radius_km` (converted to radians) is passed as `eps` / `max_eps` and
+    `min_cluster_size` as `min_samples`; both estimators use the haversine metric.
+
+    Returns a dict grouping locations by `cluster_id`.
+    """
+    radius_radians = km_to_radians(radius_km)
+
+    # settings that are identical for both estimators.
+    shared_options = dict(
+        min_samples=min_cluster_size,
+        metric="haversine",
+        n_jobs=-1,
+    )
+
+    if algorithm == "dbscan":
+        estimator = DBSCAN(eps=radius_radians, **shared_options)
+    else:
+        estimator = OPTICS(max_eps=radius_radians, **shared_options)
+
+    points_in_radians = np.radians(np.array(df[["lat", "lon"]]))
+    fitted = estimator.fit(points_in_radians)
+    return to_cluster_dict(df, fitted)
--- a/geocluster/encoding.py
+++ b/geocluster/encoding.py
@@ -0,0 +1,92 @@
+import json
+import numpy as np
+import geojson
+
+
+class NpEncoder(json.JSONEncoder):
+    """
+    JSONEncoder with support for numpy's scalar types and arrays.
+
+    numpy booleans, integers and floats are converted to their builtin
+    counterparts and arrays to (nested) lists. Every other unsupported object is
+    handed to the base class, which raises `TypeError`.
+    """
+
+    def default(self, obj):
+        if isinstance(obj, np.bool_):
+            return bool(obj)
+        if isinstance(obj, np.integer):
+            return int(obj)
+        if isinstance(obj, np.floating):
+            return float(obj)
+        if isinstance(obj, np.ndarray):
+            return obj.tolist()
+        return super().default(obj)
+
+
+class StringEncoder:
+    """
+    Encodes clustering result as a string.
+
+    Every cluster becomes a "Cluster <id>" heading followed by one line per
+    record, listing the record's "key value" pairs separated by ", ".
+    """
+
+    def __init__(self):
+        # output lines collected so far, joined by `get`.
+        self.state = []
+
+    def visitor(self, cluster_id, cluster):
+        """Append the heading, the record lines and a blank separator line."""
+        self.state.append("Cluster {}".format(cluster_id))
+        self.state.extend(
+            ", ".join("{} {}".format(field, value) for field, value in record.items())
+            for record in cluster
+        )
+        # separate clusters by an empty line.
+        self.state.append("")
+
+    def get(self):
+        """Return all collected lines as a single newline-separated string."""
+        return "\n".join(self.state)
+
+
+class JSONEncoder:
+    """
+    Encodes clustering result as a JSON array.
+
+    Each visited cluster contributes exactly one object of the form
+    `{"cluster_id": <id>, "points": [<record>, ...]}`.
+    """
+
+    def __init__(self):
+        # one entry per visited cluster, serialized by `get`.
+        self.state = []
+
+    def visitor(self, cluster_id, cluster):
+        """Record a cluster and its points."""
+        # append once per cluster; appending per record would emit the same
+        # cluster object as many times as it has points.
+        self.state.append({"cluster_id": cluster_id, "points": list(cluster)})
+
+    def get(self):
+        """Return the collected clusters serialized as a JSON string."""
+        return json.dumps(self.state, cls=NpEncoder)
+
+
+class GeoJSONEncoder:
+    """
+    Encodes clustering result as a GeoJSON FeatureCollection.
+
+    Every record becomes a Point feature located at its `lon` / `lat`; the
+    record's remaining fields plus the `cluster_id` become the feature properties.
+    """
+
+    def __init__(self):
+        # features collected so far, in visiting order.
+        self.state = []
+
+    def visitor(self, cluster_id, cluster):
+        """Append one feature per record of the cluster."""
+        for record in cluster:
+            # work on a copy so that the caller's record is left untouched.
+            properties = dict(record)
+            properties["cluster_id"] = cluster_id
+
+            longitude = properties.pop("lon")
+            latitude = properties.pop("lat")
+
+            feature = geojson.Feature(
+                geometry=geojson.Point((longitude, latitude)),
+                properties=properties,
+            )
+            self.state.append(feature)
+
+    def get(self):
+        """Return the collected features serialized as a JSON string."""
+        collection = geojson.FeatureCollection(self.state)
+        return json.dumps(collection, cls=NpEncoder)
+
+
+def encode_clusters(clusters):
+    """
+    Encode a dict <cluster_id, list[dict]> in every supported output format.
+
+    Returns a dict with the keys "json", "geojson" and "string", each holding the
+    encoded result as a string.
+    """
+    encoders = {
+        "json": JSONEncoder(),
+        "geojson": GeoJSONEncoder(),
+        "string": StringEncoder(),
+    }
+
+    # feed every cluster to every encoder in a single pass over the clusters.
+    for cluster_id, cluster in clusters.items():
+        for encoder in encoders.values():
+            encoder.visitor(cluster_id, cluster)
+
+    return {name: encoder.get() for name, encoder in encoders.items()}
--- a/geocluster/io.py
+++ b/geocluster/io.py
@@ -0,0 +1,44 @@
+from keplergl import KeplerGl
+from pathlib import Path
+from pkg_resources import resource_filename
+import json
+import json
+import pandas as pd
+
+
+def read_csv_file(filename):
+    """
+    Load the input csv file into a dataframe.
+
+    Rows with a missing `lat` or `lon` value are dropped.
+    """
+    frame = pd.read_csv(filename)
+    return frame.dropna(subset=["lat", "lon"])
+
+
+def ensure_file_path(dirname, filename):
+    """
+    Build the path of `filename` inside `dirname`.
+
+    The directory, including missing parents, is created if it does not exist.
+    """
+    directory = Path(dirname)
+    directory.mkdir(parents=True, exist_ok=True)
+    return directory.joinpath(filename)
+
+
+def write_output_file(dirname, filename, data):
+    """
+    Write the string `data` to `filename` inside `dirname`, ensuring parent directories.
+
+    Returns the path of the written file.
+    """
+    filepath = ensure_file_path(dirname, filename)
+
+    # pin the encoding: the platform default is locale-dependent and may fail on,
+    # or garble, non-ASCII field values such as names.
+    with open(filepath, "w", encoding="utf-8") as f:
+        f.write(data)
+
+    return filepath
+
+
+def write_visualization(dirname, filename, data):
+    """
+    Render `data` as a kepler.gl map and save it as an html file.
+
+    The map is configured with the `kepler_config.json` of the `geocluster`
+    package. Parent directories are ensured; returns the path of the html file.
+    """
+    kepler_map = KeplerGl()
+    kepler_map.add_data(data=data, name="clusters")
+
+    # config configures a default color scheme for our clusters layer.
+    config_path = resource_filename("geocluster", "kepler_config.json")
+    with open(config_path) as config_file:
+        kepler_map.config = json.load(config_file)
+
+    html_path = ensure_file_path(dirname, filename)
+    kepler_map.save_to_html(file_name=str(html_path), center_map=True)
+
+    return html_path
--- a/geocluster/kepler_config.json
+++ b/geocluster/kepler_config.json
@@ -0,0 +1,86 @@
+{
+  "version": "v1",
+  "config": {
+    "visState": {
+      "filters": [],
+      "layers": [
+        {
+          "type": "geojson",
+          "config": {
+            "dataId": "clusters",
+            "label": "clusters",
+            "color": [179, 173, 158],
+            "highlightColor": [252, 242, 26, 255],
+            "columns": { "geojson": "_geojson" },
+            "isVisible": true,
+            "visConfig": {
+              "opacity": 0.8,
+              "strokeOpacity": 0.8,
+              "thickness": 0.5,
+              "strokeColor": null,
+              "colorRange": {
+                "name": "Global Warming",
+                "type": "sequential",
+                "category": "Uber",
+                "colors": [
+                  "#5A1846",
+                  "#900C3F",
+                  "#C70039",
+                  "#E3611C",
+                  "#F1920E",
+                  "#FFC300"
+                ]
+              },
+              "strokeColorRange": {
+                "name": "Global Warming",
+                "type": "sequential",
+                "category": "Uber",
+                "colors": [
+                  "#5A1846",
+                  "#900C3F",
+                  "#C70039",
+                  "#E3611C",
+                  "#F1920E",
+                  "#FFC300"
+                ]
+              },
+              "radius": 10,
+              "sizeRange": [0, 10],
+              "radiusRange": [0, 50],
+              "heightRange": [0, 500],
+              "elevationScale": 5,
+              "enableElevationZoomFactor": true,
+              "stroked": false,
+              "filled": true,
+              "enable3d": false,
+              "wireframe": false
+            },
+            "hidden": false,
+            "textLabel": [
+              {
+                "field": null,
+                "color": [255, 255, 255],
+                "size": 18,
+                "offset": [0, 0],
+                "anchor": "start",
+                "alignment": "center"
+              }
+            ]
+          },
+          "visualChannels": {
+            "colorField": { "name": "cluster_id", "type": "integer" },
+            "colorScale": "quantile",
+            "strokeColorField": null,
+            "strokeColorScale": "quantile",
+            "sizeField": null,
+            "sizeScale": "linear",
+            "heightField": null,
+            "heightScale": "linear",
+            "radiusField": null,
+            "radiusScale": "linear"
+          }
+        }
+      ]
+    }
+  }
+}
--- a/setup.py
+++ b/setup.py
@@ -0,0 +1,21 @@
+from setuptools import setup
+
+setup(
+    name="geocluster",
+    version="0.1",
+    description="",
+    author="Bellingcat",
+    packages=["geocluster"],
+    entry_points={"console_scripts": ["geocluster = geocluster.cli:main"]},
+    install_requires=[
+        "click",
+        "geojson",
+        "keplergl",
+        "numpy",
+        "pandas",
+        "scikit-learn",
+    ],
+    extras_require={"dev": ["black", "wheel"]},
+    # `include_package_data` only picks up files matched by MANIFEST.in (or a VCS
+    # plugin); list the kepler.gl config explicitly so that it is always installed
+    # next to the code that loads it at runtime.
+    package_data={"geocluster": ["kepler_config.json"]},
+    include_package_data=True,
+    zip_safe=False,
+)
--- a/tests/fixtures/mock1000.csv
+++ b/tests/fixtures/mock1000.csv
--- a/tests/fixtures/mock15000.csv
+++ b/tests/fixtures/mock15000.csv
--- a/tests/fixtures/mock50.csv
+++ b/tests/fixtures/mock50.csv
@@ -0,0 +1,51 @@
+id,name,lat,lon
+1,Bonnibelle Mathwen,40.1324085,64.4911086
+2,Fayette Elt,49.6235379,6.2379992
+3,Jandy Cooch,-7.5874497,110.7420464
+4,Robb Gerbel,22.2455315,-80.3936994
+5,Silvie Clipson,40.3418956,21.5118754
+6,Kristina Izakoff,30.741991,121.341969
+7,Ricky Sweeting,11.2666664,122.5333328
+8,Quintin Hazart,35.119385,109.167435
+9,Sholom Kilmister,55.7393377,37.6642542
+10,Misty Dooher,49.9776657,20.9421091
+11,Knox Phython,-8.4985,123.5226
+12,Shay Davidy,14.4142191,120.9495257
+13,Dre Benoey,-31.4561755,-64.2111608
+14,Prudi Tomek,40.692169,117.163821
+15,Evey Ealam,31.123586,114.893666
+16,Norry Urch,45.8022541,17.497172
+17,Valerye Dumberell,50.4438122,48.1450932
+18,Freddy Furtado,58.3767785,11.6764538
+19,Catarina Samett,50.4034992,26.141892
+20,Lidia Muckian,-38.7359018,-72.5903739
+21,Stacey Dockrey,29.741986,106.273576
+22,Norri Bonhill,60.6184239,16.7769535
+23,Florence Pretsel,55.96667,25.15
+24,Marten Matantsev,50.9603536,14.3596743
+25,Claiborn Everall,43.884893,-0.5046003
+26,Randolf Hailey,49.4679131,18.2282007
+27,Meggi Kirkebye,57.6888453,11.9943311
+28,Denna Le Grove,16.7124054,98.5746649
+29,Randy Verheijden,40.4722617,-7.9751886
+30,Caterina Blancowe,35.422892,103.352654
+31,Joanne Adamovitch,55.9251242,39.4489055
+32,Orazio Coppins,,111.6556388
+33,Anastassia Bennedsen,45.212088,130.478187
+34,Linoel Ruggier,22.066171,107.781956
+35,Paulina Moralis,-11.806679,-77.1657716
+36,Ambur Outhwaite,59.4033695,17.9443213
+37,Laetitia Aspland,37.6086169,138.9089988
+38,Dew Moxstead,6.1317011,-75.6382657
+39,Berna Klaiser,40.1394691,-8.3092933
+40,Krystle Ingold,7.1518505,0.4738293
+41,Cassaundra Cuffin,56.6342788,36.885813
+42,Malanie Harpin,46.9,109.75
+43,Laurence Stothart,39.912765,116.18362
+44,Luz O'Siaghail,40.4476834,25.5917918
+45,Brittni Garrod,59.0836123,16.18741
+46,Karlie Semrad,-8.793392,121.9330894
+47,Leigh Allderidge,45.768045,15.947739
+48,Ashlin Gogerty,50.3250139,34.9100068
+49,Mozelle De Launde,53.31611,40.70806
+50,Ema le Keux,41.6315023,19.9310781