Bump version to v0.3.0 for release

feat: optional kepler.gl integration (#12 )
test: add test suite (#7 )
2026-06-12 13:38:29 +03:00 · 2022-07-04 16:43:48 +02:00 · 2022-07-04 16:22:27 +02:00 · 2022-07-04 13:54:07 +02:00 · 2022-07-01 19:18:06 +02:00 · 2022-07-01 18:57:40 +02:00
32 changed files with 3041 additions and 1238 deletions
--- a/.github/actions/setup-venv/action.yml
+++ b/.github/actions/setup-venv/action.yml
@@ -0,0 +1,53 @@
 name: Python virtualenv
 description: Set up a Python virtual environment with caching
 inputs:
  python-version:
    description: The Python version to use
    required: true
  cache-prefix:
    description: Update this to invalidate the cache
    required: true
    default: v0
 runs:
  using: composite
  steps:
    - name: Setup Python
      uses: actions/setup-python@v4
      with:
        python-version: ${{ inputs.python-version }}
    - shell: bash
      run: |
        # Install prerequisites.
        pip install --upgrade pip setuptools wheel virtualenv
    - shell: bash
      run: |
        # Get the exact Python version to use in the cache key.
        echo "PYTHON_VERSION=$(python --version)" >> $GITHUB_ENV
    - uses: actions/cache@v2
      id: virtualenv-cache
      with:
        path: .venv
        key: ${{ inputs.cache-prefix }}-${{ runner.os }}-${{ env.PYTHON_VERSION }}-${{ hashFiles('Pipfile.lock') }}
    - if: steps.virtualenv-cache.outputs.cache-hit != 'true'
      shell: bash
      run: |
        # Set up virtual environment without cache hit.
        test -d .venv || virtualenv -p $(which python) --copies --reset-app-data .venv
        . .venv/bin/activate
        pip install -e .[dev]
    - if: steps.virtualenv-cache.outputs.cache-hit == 'true'
      shell: bash
      run: |
        # Set up virtual environment from cache hit.
        . .venv/bin/activate
        pip install --no-deps -e .[dev]
    - shell: bash
      run: |
        # Show environment info.
        . .venv/bin/activate
        echo "✓ Installed $(python --version) virtual environment to $(which python)"
--- a/.github/workflows/lint.yml
+++ b/.github/workflows/lint.yml
@@ -1,10 +0,0 @@
 name: Lint
 on: [push]
 jobs:
  black:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v3
      - uses: psf/black@stable
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -0,0 +1,117 @@
 name: Main
 concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true
 # on: [push]
 on:
  pull_request:
    branches:
      - main
  push:
    branches:
      - main
    tags:
      - "v*.*.*"
 env:
  # Change this to invalidate existing cache.
  CACHE_PREFIX: v0
  PYTHONPATH: ./
 jobs:
  checks:
    name: Python ${{ matrix.python }} - ${{ matrix.task.name }}
    runs-on: [ubuntu-latest]
    timeout-minutes: 15
    strategy:
      fail-fast: false
      matrix:
        include:
          - python: "3.10"
            task:
              name: "Build"
              run: |
                python setup.py check
                python setup.py bdist_wheel sdist
          - python: "3.10"
            task:
              name: "Style"
              run: |
                black --check .
          - python: "3.10"
            task:
              name: "Test"
              run: pytest --exitfirst --failed-first
    steps:
      - uses: actions/checkout@v3
      - name: Setup Python environment
        uses: ./.github/actions/setup-venv
        with:
          python-version: ${{ matrix.python }}
          cache-prefix: ${{ env.CACHE_PREFIX }}
      - name: ${{ matrix.task.name }}
        run: |
          . .venv/bin/activate
          ${{ matrix.task.run }}
      - name: Upload package distribution files
        if: matrix.task.name == 'Build'
        uses: actions/upload-artifact@v3
        with:
          name: package
          path: dist
      - name: Clean up
        if: always()
        run: |
          . .venv/bin/activate
          pip uninstall -y geoclustering
  release:
    name: Release
    runs-on: ubuntu-latest
    needs: [checks]
    if: startsWith(github.ref, 'refs/tags/')
    steps:
      - uses: actions/checkout@v1
      - name: Setup Python
        uses: actions/setup-python@v4
        with:
          python-version: "3.10"
      - name: Install requirements
        run: |
          pip install --upgrade pip setuptools wheel "twine>=1.11.0"
      - name: Prepare environment
        run: |
          echo "RELEASE_VERSION=${GITHUB_REF#refs/tags/v}" >> $GITHUB_ENV
          echo "TAG=${GITHUB_REF#refs/tags/}" >> $GITHUB_ENV
      - name: Download package distribution files
        uses: actions/download-artifact@v3
        with:
          name: package
          path: dist
      - name: Publish package to PyPI
        run: |
          twine upload -u '${{ secrets.PYPI_USERNAME }}' -p '${{ secrets.PYPI_PASSWORD }}' dist/*
      - name: Publish GitHub release
        uses: softprops/action-gh-release@v1
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        with:
          # body_path: ${{ github.workspace }}-RELEASE_NOTES.md
          prerelease: ${{ contains(env.TAG, 'rc') }}
          files: |
            dist/*
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -0,0 +1,10 @@
 repos:
  - repo: https://github.com/psf/black
    rev: 22.3.0
    hooks:
      - id: black
        # It is recommended to specify the latest version of Python
        # supported by your project here, or alternatively use
        # pre-commit's default_language_version, see
        # https://pre-commit.com/#top_level-default_language_version
        language_version: python3.9
--- a/2
+++ b/2
@@ -1,6 +1,6 @@
 MIT License
-Copyright (c) 2022, Felix Spöttel
+Copyright (c) 2022, Stichting Bellingcat
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
--- a/21
+++ b/21
@@ -0,0 +1,21 @@
 [[source]]
 url = "https://pypi.org/simple"
 verify_ssl = true
 name = "pypi"
 [packages]
 click = "*"
 geojson = "*"
 keplergl = "*"
 numpy = "*"
 pandas = "*"
 scikit-learn = "*"
 [dev-packages]
 black = "*"
 pre-commit = "*"
 pytest = "*"
 wheel = "*"
 [requires]
 python_version = "3.9"
--- a/Pipfile.lock
+++ b/Pipfile.lock
--- a/README.md
+++ b/README.md
@@ -1,4 +1,4 @@
-# geocluster
+# geoclustering
 > 📍 command-line tool for clustering geolocations.
@@ -10,34 +10,38 @@
 ### Clustering Method
-A cluster is created when a certain number of points (=> `--size`) each are within a given distance (=> `--distance`) of at least one other point in the cluster. 
+A cluster is created when a certain number of points (defined with `--size`) each are within a given distance (defined with `--distance`) of at least one other point in the cluster. 
 ## Install
-Clone the repository:
+Install with pip:
 ```sh
-git clone https://github.com/fspoettel/geocluster
+# with kepler.gl visualization support
-cd geocluster
+pip install geoclustering[full]
 # only text-based output
 pip install geoclustering
 ```
-Install keplergl build dependencies:
+If the `full` install fails, you might need to install kepler.gl build dependencies:
 ```sh
 # macos
 brew install proj gdal
 ```
 Install project with pip:
 ```sh
 pip install .
 ```
 ## Usage
 ```
-Usage: geocluster [OPTIONS] FILENAME
+Usage: geoclustering [OPTIONS] FILENAME
  Tool to cluster geolocations. A cluster is created when a certain number of
  points (defined with --size) each are within a given distance (defined with
  --distance) of at least one other point in the cluster. Input is supplied as
  a csv file. At a minimum, each row needs to have a 'lat' and a 'lon' column.
  Other rows are reflected to the output.
 Options:
  -d, --distance FLOAT            (in km) Max. distance between two points in
@@ -50,12 +54,15 @@ Options:
                                  Clustering algorithm to be used. `optics`
                                  produces tighter clusters but is slower.
                                  Default: dbscan
  --open                          Open the generated visualization in the
                                  default browser automatically.
  --debug                         Print debug output.
  --help                          Show this message and exit.
 ```
 ## Input
-Inputs are supplied as a `.csv` file. The only required fields are `lat` and `lon`, all other fields are reflected to the output.
+Inputs are supplied as a `.csv` file. At a minimum, each row needs to have a `lat` and a `lon`` column. Other rows are reflected to the output.
 ```csv
 id,name,lat,lon
@@ -65,7 +72,7 @@ id,name,lat,lon
 ## Output
-If at least one cluster was found, the tool outputs a folder with `json`, `geojson`, `text` and a kepler.gl `html` files.
+If at least one cluster was found, the tool outputs a folder with output as `json`, `geojson`, `txt` files. A kepler.gl `html` file is generated as well.
 ### JSON
@@ -114,7 +121,7 @@ Encodes a single `FeatureCollection`, containing all points as `Feature` objects
 }
 ```
-### txt
+### Text
 Encodes cluster as blocks separated by a newline, where each line in a cluster block contains one point.
@@ -128,3 +135,17 @@ id 9, name Rosanna Foggo, lat -6.2074293, lon 106.8915948
 ### kepler.gl
 ![kepler.gl instance](https://user-images.githubusercontent.com/1682504/176478177-c0446b51-4060-495c-803d-79e2bbd3e966.png)
 ## Develop
 It is assumed that you are using **Python3.9+**. It is encouraged to [setup a virtualenv](https://wiki.archlinux.org/title/Python/Virtual_environment#venv>) for development.
 ```sh
    # install dependencies & dev-dependencies
    pip install -e .[dev,full]
    # install a git hook that runs the code formatter before each commit.
    pre-commit install
 ```
 We use [Black](https://github.com/psf/black) as our code formatter. If you don't want to use the `pre-commit` hook, you can run the formatter manually or via an editor plugin.
--- a/geocluster/main.py
+++ b/geocluster/main.py
@@ -1,64 +0,0 @@
 import click
 import webbrowser
 import geocluster.clustering as clustering
 import geocluster.encoding as encoding
 import geocluster.io as io
@click.command()
@click.option(
    "--distance",
    "-d",
    type=click.FLOAT,
    required=True,
    help="(in km) Max. distance between two points in a cluster.",
 )
@click.option(
    "--size",
    "-s",
    type=click.INT,
    required=True,
    help="Min. number of points in a cluster.",
 )
@click.option(
    "--output",
    "-o",
    type=click.Path(exists=False),
    default="output",
    help="Output directory for results. Default: ./output",
 )
@click.option(
    "--algorithm",
    "-a",
    type=click.Choice(
        ["dbscan", "optics"],
        case_sensitive=False,
    ),
    default="dbscan",
    help="Clustering algorithm to be used. `optics` produces tighter clusters but is slower. Default: dbscan",
 )
@click.argument("filename", type=click.Path(exists=True))
 def main(distance, size, output, filename, algorithm):
    df = io.read_csv_file(filename)
    clusters = clustering.cluster_locations(
        df=df, algorithm=algorithm, radius_km=distance, min_cluster_size=size
    )
    if not bool(clusters):
        click.echo("Did not find clusters matching input parameters.")
        return
    encoded = encoding.encode_clusters(clusters)
    io.write_output_file(output, "result.txt", encoded["string"])
    io.write_output_file(output, "result.json", encoded["json"])
    io.write_output_file(output, "result.geojson", encoded["geojson"])
    vis = io.write_visualization(output, "result.html", encoded["geojson"])
    webbrowser.open_new_tab("file://" + str(vis.absolute()))
 if __name__ == "__main__":
    main()
--- a/geocluster/io.py
+++ b/geocluster/io.py
@@ -1,78 +0,0 @@
 from keplergl import KeplerGl
 from pathlib import Path
 from pkg_resources import resource_filename
 import json
 import json
 import pandas as pd
 import numpy as np
 def is_valid_lat(val: str) -> bool:
    """Given a string, check if it corresponds to a valid decimal latitude value"""
    try:
        val = float(val)
        return val >= -90 and val <= 90
    except:
        return False
 def is_valid_lon(val: str) -> bool:
    """Given a string, check if it corresponds to a valid decimal longitude value"""
    try:
        val = float(val)
        return val >= -180 and val <= 180
    except:
        return False
 def read_csv_file(filename):
    """Read input csv file, dropping rows that don't have valid location data."""
    df = pd.read_csv(filename)
    initial_rows = len(df)
    df = df.dropna(subset=["lat", "lon"])
    df = df.replace(
        {np.nan: None}
    )  # replace for other fields not to break kepler parsing
    print(f"Ignored {initial_rows - len(df)} coordinates with NaN")
    valid_index = df.lat.astype(str).apply(is_valid_lat) & df.lon.astype(str).apply(
        is_valid_lon
    )
    if len(df_invalid := df[~valid_index]):
        print(f"Found {len(df_invalid)} invalid coordinate pairs, ignoring:")
        print(df_invalid[["lat", "lon"]].to_string())
    return df[valid_index]
 def ensure_file_path(dirname, filename):
    """Ensure a parent directory exists for a file."""
    path = Path(dirname)
    path.mkdir(parents=True, exist_ok=True)
    return path / filename
 def write_output_file(dirname, filename, data):
    """Write a file, ensuring parent directories."""
    filepath = ensure_file_path(dirname, filename)
    with open(filepath, "w") as f:
        f.write(data)
    return filepath
 def write_visualization(dirname, filename, data):
    """Write a visualization, ensuring parent directories."""
    map = KeplerGl()
    map.add_data(data=data, name="clusters")
    # config configures a default color scheme for our clusters layer.
    config_file = resource_filename("geocluster", "kepler_config.json")
    with open(config_file) as f:
        map.config = json.loads(f.read())
    filepath = ensure_file_path(dirname, filename)
    map.save_to_html(file_name=str(filepath), center_map=True)
    return filepath
--- a/geoclustering/init.py
+++ b/geoclustering/init.py
--- a/geoclustering/main.py
+++ b/geoclustering/main.py
@@ -0,0 +1,96 @@
 from pathlib import Path
 import click
 import os
 import webbrowser
 import geoclustering.clustering as clustering
 import geoclustering.encoding as encoding
 import geoclustering.io as io
@click.command(
    help="Tool to cluster geolocations. A cluster is created when a certain number of points (defined with --size) each are within a given distance (defined with --distance) of at least one other point in the cluster. Input is supplied as a csv file. At a minimum, each row needs to have a 'lat' and a 'lon' column. Other rows are reflected to the output."
 )
@click.option(
    "--distance",
    "-d",
    type=click.FLOAT,
    required=True,
    help="(in km) Max. distance between two points in a cluster.",
 )
@click.option(
    "--size",
    "-s",
    type=click.INT,
    required=True,
    help="Min. number of points in a cluster.",
 )
@click.option(
    "--output",
    "-o",
    type=click.Path(exists=False),
    default="output",
    help="Output directory for results. Default: ./output",
 )
@click.option(
    "--algorithm",
    "-a",
    type=click.Choice(
        ["dbscan", "optics"],
        case_sensitive=False,
    ),
    default="dbscan",
    help="Clustering algorithm to be used. `optics` produces tighter clusters but is slower. Default: dbscan",
 )
@click.option(
    "--open",
    is_flag=True,
    help="Open the generated visualization in the default browser automatically.",
 )
@click.option("--debug", is_flag=True, help="Print debug output.")
@click.argument("filename", type=click.Path(exists=True))
 def main(distance, size, output, filename, algorithm, open, debug):
    def print_debug(s):
        if debug:
            click.secho(s, fg="bright_black")
    df = io.read_csv_file(filename)
    print_debug(f"Read {len(df)} valid coordinates from {Path(filename).absolute()}")
    clusters = clustering.cluster_locations(
        df=df, algorithm=algorithm, radius_km=distance, min_cluster_size=size
    )
    if not bool(clusters):
        click.secho("Did not find clusters matching input parameters.", fg="yellow")
        return
    print_debug(f"Found {len(clusters)} valid clusters using {algorithm}")
    encoded = encoding.encode_clusters(clusters)
    io.write_output_file(output, "result.txt", encoded["string"])
    io.write_output_file(output, "result.json", encoded["json"])
    io.write_output_file(output, "result.geojson", encoded["geojson"])
    vis = io.write_visualization(output, "result.html", encoded["geojson"])
    if vis is None:
        print_debug(f"Skipped generating visualization: kepler is not installed.")
    click.echo(f"Output files saved to {Path(output).absolute()}")
    if open:
        if vis:
            webbrowser.open_new_tab("file://" + str(vis.absolute()))
            print_debug(f"Opened visualization in default browser.")
        else:
            click.secho(
                "Can't open kepler.gl: package not installed. Please re-install geoclustering with `pip install geoclustering[full]`.",
                fg="yellow",
            )
    click.secho("Clustering completed.", fg="green")
 if __name__ == "__main__":
    main()
--- a/geoclustering/clustering.py
+++ b/geoclustering/clustering.py
@@ -14,8 +14,6 @@ def to_cluster_dict(df, clustering):
    """
    clusters_by_id = {}
    print(clustering.labels_)
    for idx, cluster_id in enumerate(clustering.labels_):
        # ignore "noise" locations that don't belong to any cluster.
        if cluster_id > -1:
--- a/geoclustering/encoding.py
+++ b/geoclustering/encoding.py
--- a/geoclustering/io.py
+++ b/geoclustering/io.py
@@ -0,0 +1,117 @@
 from pathlib import Path
 from pkg_resources import resource_filename
 import json
 import pandas as pd
 import numpy as np
 import os
 import sys
 # kepler is optional, check if installed.
 try:
    from keplergl import KeplerGl
 except:
    has_kepler = False
 else:
    has_kepler = True
 class HiddenPrints:
    """Disables stdout prints for a block of code."""
    def __enter__(self):
        self._original_stdout = sys.stdout
        sys.stdout = open(os.devnull, "w")
    def __exit__(self, exc_type, exc_val, exc_tb):
        sys.stdout.close()
        sys.stdout = self._original_stdout
 def is_valid_lat(val: str) -> bool:
    """Given a string, check if it corresponds to a valid decimal latitude value"""
    try:
        val = float(val)
        return val >= -90 and val <= 90
    except:
        return False
 def is_valid_lon(val: str) -> bool:
    """Given a string, check if it corresponds to a valid decimal longitude value"""
    try:
        val = float(val)
        return val >= -180 and val <= 180
    except:
        return False
 def is_not_none(val: any) -> bool:
    return val is not None
 def read_csv_file(filename):
    """Read input csv file, dropping rows that don't have valid location data."""
    # replace NaN for all fields not to break kepler parsing.
    df = pd.read_csv(filename).replace({np.nan: None})
    # construct an index of values with valid lat & lon.
    valid_index = df.lat.apply(is_valid_lat) & df.lon.apply(is_valid_lon)
    df_invalid = df[~valid_index]
    if count_invalid := len(df_invalid):
        df_not_empty = df_invalid[
            (df_invalid.lat.apply(is_not_none) | df_invalid.lon.apply(is_not_none))
        ]
        count_not_empty = len(df_not_empty)
        if count_empty := count_invalid - count_not_empty:
            print(f"Removed {count_empty} empty coordinate pairs.")
        if count_not_empty:
            print(f"Removed {count_not_empty} invalid coordinate pairs:")
            print(df_not_empty[["lat", "lon"]].to_string())
    return df[valid_index]
 def ensure_file_path(dirname, filename):
    """Ensure a parent directory exists for a file."""
    path = Path(dirname)
    path.mkdir(parents=True, exist_ok=True)
    return path / filename
 def write_output_file(dirname, filename, data):
    """Write a file, ensuring parent directories."""
    filepath = ensure_file_path(dirname, filename)
    with open(filepath, "w") as f:
        f.write(data)
    return filepath
 def write_visualization(dirname, filename, data):
    """Write a visualization, ensuring parent directories."""
    if not has_kepler:
        return None
    # Hide kepler stdout output.
    with HiddenPrints():
        map = KeplerGl()
    map.add_data(data=data, name="clusters")
    # config configures a default color scheme for our clusters layer.
    config_file = resource_filename("geoclustering", "kepler_config.json")
    with open(config_file) as f:
        map.config = json.loads(f.read())
    filepath = ensure_file_path(dirname, filename)
    # Hide kepler stdout output.
    with HiddenPrints():
        map.save_to_html(file_name=str(filepath), center_map=True)
    return filepath
--- a/geoclustering/kepler_config.json
+++ b/geoclustering/kepler_config.json
@@ -9,7 +9,7 @@
          "config": {
            "dataId": "clusters",
            "label": "clusters",
-            "color": [179, 173, 158],
+            "color": [248, 149, 112],
            "highlightColor": [252, 242, 26, 255],
            "columns": { "geojson": "_geojson" },
            "isVisible": true,
@@ -19,16 +19,30 @@
              "thickness": 0.5,
              "strokeColor": null,
              "colorRange": {
-                "name": "Global Warming",
+                "name": "Uber Viz Qualitative 4",
-                "type": "sequential",
+                "type": "qualitative",
                "category": "Uber",
                "colors": [
-                  "#5A1846",
+                  "#12939A",
-                  "#900C3F",
+                  "#DDB27C",
-                  "#C70039",
+                  "#88572C",
-                  "#E3611C",
+                  "#FF991F",
-                  "#F1920E",
+                  "#F15C17",
-                  "#FFC300"
+                  "#223F9A",
                  "#DA70BF",
                  "#125C77",
                  "#4DC19C",
                  "#776E57",
                  "#17B8BE",
                  "#F6D18A",
                  "#B7885E",
                  "#FFCB99",
                  "#F89570",
                  "#829AE3",
                  "#E79FD5",
                  "#1E96BE",
                  "#89DAC1",
                  "#B3AD9E"
                ]
              },
              "strokeColorRange": {
--- a/geoclustering/version.py
+++ b/geoclustering/version.py
@@ -0,0 +1,11 @@
 _MAJOR = "0"
 _MINOR = "3"
 # On main and in a nightly release the patch should be one ahead of the last
 # released build.
 _PATCH = "0"
 # This is mainly for nightly builds which have the suffix ".dev$DATE". See
 # https://semver.org/#is-v123-a-semantic-version for the semantics.
 _SUFFIX = ""
 VERSION_SHORT = "{0}.{1}".format(_MAJOR, _MINOR)
 VERSION = "{0}.{1}.{2}{3}".format(_MAJOR, _MINOR, _PATCH, _SUFFIX)
--- a/pytest.ini
+++ b/pytest.ini
@@ -0,0 +1,3 @@
 [pytest]
 testpaths = tests/
 python_files = *.py
--- a/scripts/release.sh
+++ b/scripts/release.sh
@@ -0,0 +1,18 @@
 #!/bin/bash
 set -e
 TAG=$(python -c 'from geoclustering.version import VERSION; print("v" + VERSION)')
 read -p "Creating new release for $TAG. Do you want to continue? [Y/n] " prompt
 if [[ $prompt == "y" || $prompt == "Y" || $prompt == "yes" || $prompt == "Yes" ]]; then
    git add -A
    git commit -m "Bump version to $TAG for release" || true && git push
    echo "Creating new git tag $TAG"
    git tag "$TAG" -m "$TAG"
    git push --tags
 else
    echo "Cancelled"
    exit 1
 fi
--- a/setup.py
+++ b/setup.py
@@ -1,21 +1,41 @@
 from setuptools import setup
 # version.py defines the VERSION and VERSION_SHORT variables.
 # We use exec here so we don't import cached_path whilst setting up.
 VERSION = {}  # type: ignore
 with open("geoclustering/version.py", "r") as version_file:
    exec(version_file.read(), VERSION)
 setup(
-    name="geocluster",
+    name="geoclustering",
-    version="0.1",
+    version=VERSION["VERSION"],
-    description="",
+    description="📍 command-line tool for clustering geolocations.",
    long_description=open("README.md").read(),
    long_description_content_type="text/markdown",
    classifiers=[
        "Intended Audience :: Developers",
        "Intended Audience :: Science/Research",
        "License :: OSI Approved :: MIT License",
        "Programming Language :: Python :: 3",
    ],
    author="Bellingcat",
-    packages=["geocluster"],
+    author_email="tech@bellingcat.com",
-    entry_points={"console_scripts": ["geocluster = geocluster.__main__:main"]},
+    license="MIT",
    packages=["geoclustering"],
    package_data={"geoclustering": ["kepler_config.json"]},
    keywords=["cluster", "gis", "pattern-analysis"],
    entry_points={"console_scripts": ["geoclustering = geoclustering.__main__:main"]},
    install_requires=[
        "click",
        "geojson",
        "keplergl",
        "numpy",
        "pandas",
        "scikit-learn",
    ],
-    extras_require={"dev": ["black", "wheel"]},
+    extras_require={
        "dev": ["black", "wheel", "pre-commit", "pytest"],
        "full": ["keplergl"],
    },
    include_package_data=True,
    zip_safe=False,
 )
--- a/tests/init.py
+++ b/tests/init.py
--- a/tests/clustering.py
+++ b/tests/clustering.py
@@ -0,0 +1,42 @@
 from geoclustering.clustering import cluster_locations
 from geoclustering.io import read_csv_file
 from tests.helpers import get_fixture_path, read_fixture_csv
 df = read_fixture_csv("clustering.csv")
 def has_member(list, name):
    return any(x for x in list if x["name"] == name)
 def test_clustering_all():
    # there should be one cluster with all members but Erin.
    res = cluster_locations(
        df=df, algorithm="dbscan", radius_km=1.97, min_cluster_size=4
    )
    assert len(res.values()) == 1
    assert len(res[0]) == 4
 def test_clustering_split():
    res = cluster_locations(
        df=df, algorithm="dbscan", radius_km=0.5, min_cluster_size=2
    )
    # there should be two cluster: Alice & Bob and Carol & Dan
    assert len(res.values()) == 2
    cluster_one = res[0]
    cluster_two = res[1]
    assert len(cluster_one) == 2
    assert has_member(cluster_one, "Alice")
    assert has_member(cluster_one, "Bob")
    assert has_member(cluster_two, "Carol")
    assert has_member(cluster_two, "Dan")
 def test_clustering_none():
    # there should be no clusters now.
    res = cluster_locations(
        df=df, algorithm="dbscan", radius_km=0.5, min_cluster_size=3
    )
    assert len(res.values()) == 0
--- a/tests/encoding.py
+++ b/tests/encoding.py
@@ -0,0 +1,30 @@
 from geoclustering.clustering import cluster_locations
 from geoclustering.encoding import encode_clusters
 from tests.helpers import read_fixture_csv, read_fixture_content
 df = read_fixture_csv("clustering.csv")
 def test_encoders():
    clusters = {
        0: [
            {"id": 1, "name": "Alice", "lat": 52.523955, "lon": 13.442362},
            {"id": 2, "name": "Bob", "lat": 52.526659, "lon": 13.448097},
        ],
        1: [
            {"id": 3, "name": "Carol", "lat": 52.525626, "lon": 13.419246},
            {
                "id": 4,
                "name": "Dan",
                "lat": 52.52443559865125,
                "lon": 13.41261723049818,
            },
        ],
    }
    res = encode_clusters(clusters)
    assert res["string"] == read_fixture_content("snapshots/result.txt")
    assert res["json"] == read_fixture_content("snapshots/result.json")
    assert res["geojson"] == read_fixture_content("snapshots/result.geojson")
--- a/tests/fixtures/clustering.csv
+++ b/tests/fixtures/clustering.csv
@@ -0,0 +1,6 @@
 id,name,lat,lon
 1,Alice,52.523955,13.442362
 2,Bob,52.526659,13.448097
 3,Carol,52.525626,13.419246
 4,Dan,52.52443559865125,13.41261723049818
 5,Erin,52.524838991760774,13.383188597040382
--- a/tests/fixtures/io.csv
+++ b/tests/fixtures/io.csv
@@ -0,0 +1,9 @@
 id,name,lat,lon
 1,Alice,,
 2,,52.523955,13.442362
 ,,-90.12,132.23
 4,,78.234,-180.1212
 5,Bob,52.524838991760774,13.383188597040382
 6,Peter,91.234,
 7,Horst,,23.23
 7,Erin,foo,bar
--- a/tests/fixtures/snapshots/result.geojson
+++ b/tests/fixtures/snapshots/result.geojson
@@ -0,0 +1 @@
 {"type": "FeatureCollection", "features": [{"type": "Feature", "geometry": {"type": "Point", "coordinates": [13.442362, 52.523955]}, "properties": {"id": 1, "name": "Alice", "cluster_id": 0}}, {"type": "Feature", "geometry": {"type": "Point", "coordinates": [13.448097, 52.526659]}, "properties": {"id": 2, "name": "Bob", "cluster_id": 0}}, {"type": "Feature", "geometry": {"type": "Point", "coordinates": [13.419246, 52.525626]}, "properties": {"id": 3, "name": "Carol", "cluster_id": 1}}, {"type": "Feature", "geometry": {"type": "Point", "coordinates": [13.412617, 52.524436]}, "properties": {"id": 4, "name": "Dan", "cluster_id": 1}}]}
--- a/tests/fixtures/snapshots/result.json
+++ b/tests/fixtures/snapshots/result.json
@@ -0,0 +1 @@
 [{"cluster_id": 0, "points": [{"id": 1, "name": "Alice", "lat": 52.523955, "lon": 13.442362}, {"id": 2, "name": "Bob", "lat": 52.526659, "lon": 13.448097}]}, {"cluster_id": 0, "points": [{"id": 1, "name": "Alice", "lat": 52.523955, "lon": 13.442362}, {"id": 2, "name": "Bob", "lat": 52.526659, "lon": 13.448097}]}, {"cluster_id": 1, "points": [{"id": 3, "name": "Carol", "lat": 52.525626, "lon": 13.419246}, {"id": 4, "name": "Dan", "lat": 52.52443559865125, "lon": 13.41261723049818}]}, {"cluster_id": 1, "points": [{"id": 3, "name": "Carol", "lat": 52.525626, "lon": 13.419246}, {"id": 4, "name": "Dan", "lat": 52.52443559865125, "lon": 13.41261723049818}]}]
--- a/tests/fixtures/snapshots/result.txt
+++ b/tests/fixtures/snapshots/result.txt
@@ -0,0 +1,7 @@
 Cluster 0
 id 1, name Alice, lat 52.523955, lon 13.442362
 id 2, name Bob, lat 52.526659, lon 13.448097
 Cluster 1
 id 3, name Carol, lat 52.525626, lon 13.419246
 id 4, name Dan, lat 52.52443559865125, lon 13.41261723049818
--- a/tests/helpers.py
+++ b/tests/helpers.py
@@ -0,0 +1,16 @@
 import os
 from geoclustering.io import read_csv_file
 def get_fixture_path(filename):
    dir_path = os.path.dirname(os.path.realpath(__file__))
    return os.path.join(dir_path, "fixtures", filename)
 def read_fixture_csv(filename):
    return read_csv_file(get_fixture_path(filename))
 def read_fixture_content(filename):
    with open(get_fixture_path(filename)) as f:
        return f.read()
--- a/tests/io.py
+++ b/tests/io.py
@@ -0,0 +1,25 @@
 from pathlib import Path
 import shutil
 from geoclustering.io import write_output_file
 from tests.helpers import read_fixture_csv
 def test_csv_filters():
    df = read_fixture_csv("io.csv")
    # entries 2 & 5 in fixture are valid.
    assert len(df) == 2
    assert df.iloc[0]["name"] == None
    assert df.iloc[1]["name"] == "Bob"
 def test_write_output_file():
    p = "./this/dir/does/not/exist"
    f = "test.txt"
    write_output_file(p, f, "test")
    path = Path(p) / f
    with open(path) as f:
        assert f.read() == "test"
    shutil.rmtree(Path("./this"))
Author	SHA1	Message	Date
Felix Spöttel	926aaf73d6	Bump version to v0.3.0 for release	2022-07-04 16:43:48 +02:00
Felix Spöttel	6a5cb3c3c3	feat: optional kepler.gl integration (#12 )	2022-07-04 16:22:27 +02:00
Felix Spöttel	d252c6b8f3	test: add test suite (#7 ) * add pre-commit hook * improve logging of inconsistent data	2022-07-04 13:54:07 +02:00
Felix Spöttel	1c5d0f649e	docs: update cli documentation	2022-07-01 19:18:06 +02:00
Felix Spöttel	6ed01417c3	docs: update install section	2022-07-01 18:57:40 +02:00
Felix Spöttel	3cc3c30e03	Bump version to v0.2.1 for release	2022-07-01 18:52:00 +02:00
Felix Spöttel	c9d36c6bf3	feat: print success output	2022-07-01 18:51:25 +02:00
Felix Spöttel	62da0806c7	fix: debug prints	2022-07-01 18:48:17 +02:00
Felix Spöttel	8657bd73ec	Bump version to v0.2.0 for release	2022-07-01 18:15:07 +02:00
Felix Spöttel	e633665813	chore: update license	2022-07-01 18:12:00 +02:00
Felix Spöttel	cff5256d06	feat: add `--debug` flag, improve logging & help closes #9	2022-07-01 17:53:09 +02:00
Felix Spöttel	4dfa08bbbc	feat: add `--open` flag (#11 ) closes #5	2022-07-01 17:08:53 +02:00
Felix Spöttel	eaa4022b70	ci: use pipfile.lock as cache key	2022-07-01 17:05:43 +02:00
Felix Spöttel	1cb5541baa	chore: remove clustering print	2022-07-01 17:04:56 +02:00
Felix Spöttel	b40074317c	feat: extend kepler.gl color range closes #10	2022-07-01 17:04:33 +02:00
Miguel Sozinho Ramalho	f1053953ba	feat: auto-deploy to pypi (#8 )	2022-07-01 15:23:50 +01:00
		`@@ -0,0 +1 @@`
							{"type": "FeatureCollection", "features": [{"type": "Feature", "geometry": {"type": "Point", "coordinates": [13.442362, 52.523955]}, "properties": {"id": 1, "name": "Alice", "cluster_id": 0}}, {"type": "Feature", "geometry": {"type": "Point", "coordinates": [13.448097, 52.526659]}, "properties": {"id": 2, "name": "Bob", "cluster_id": 0}}, {"type": "Feature", "geometry": {"type": "Point", "coordinates": [13.419246, 52.525626]}, "properties": {"id": 3, "name": "Carol", "cluster_id": 1}}, {"type": "Feature", "geometry": {"type": "Point", "coordinates": [13.412617, 52.524436]}, "properties": {"id": 4, "name": "Dan", "cluster_id": 1}}]}
		`@@ -0,0 +1 @@`
							[{"cluster_id": 0, "points": [{"id": 1, "name": "Alice", "lat": 52.523955, "lon": 13.442362}, {"id": 2, "name": "Bob", "lat": 52.526659, "lon": 13.448097}]}, {"cluster_id": 0, "points": [{"id": 1, "name": "Alice", "lat": 52.523955, "lon": 13.442362}, {"id": 2, "name": "Bob", "lat": 52.526659, "lon": 13.448097}]}, {"cluster_id": 1, "points": [{"id": 3, "name": "Carol", "lat": 52.525626, "lon": 13.419246}, {"id": 4, "name": "Dan", "lat": 52.52443559865125, "lon": 13.41261723049818}]}, {"cluster_id": 1, "points": [{"id": 3, "name": "Carol", "lat": 52.525626, "lon": 13.419246}, {"id": 4, "name": "Dan", "lat": 52.52443559865125, "lon": 13.41261723049818}]}]