diff --git a/.gitignore b/.gitignore
index 48809b0..2dd2637 100644
--- a/.gitignore
+++ b/.gitignore
@@ -51,5 +51,6 @@ docs/_build/
# Testing notebook
notebooks/testing.ipynb
+notebooks/investigations
*.DS_Store
diff --git a/notebooks/004_connection_check.ipynb b/notebooks/004_connection_check.ipynb
new file mode 100644
index 0000000..8192684
--- /dev/null
+++ b/notebooks/004_connection_check.ipynb
@@ -0,0 +1,363 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "id": "b7641405",
+ "metadata": {},
+ "source": [
+ "*In this tutorial we will investigate two separate companies and check if they are connected.*"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "e39bd44d",
+ "metadata": {},
+ "source": [
+ "There are instances where we may want to see if two companies are connected. We can do this by simply building a network for each company and comparing them to see if there are any common officers, addresses or companies.\n",
+ "\n",
+ "Let's test this approach with two example companies, Zahawi & Zahawi Ltd (07285998) and Gorgeous Services Limited (05714521):"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "id": "53435932",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import sugartrail\n",
+ "import pandas as pd\n",
+ "sugartrail.api.basic_auth.username = \"\""
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "489a4141",
+ "metadata": {},
+ "source": [
+ "Create one network for Zahawi & Zahawi including some limits to reduce the number of possibly irrelevant connections:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "id": "300cecde",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "zahawi_connections = sugartrail.base.Network(company_id='07285998')\n",
+ "zahawi_connections.hop.officer_appointments_maxsize = 20\n",
+ "zahawi_connections.hop.officers_at_address_maxsize = 20\n",
+ "zahawi_connections.hop.companies_at_address_maxsize = 20"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "bf8ddb84",
+ "metadata": {},
+ "source": [
+ "Create a second network for Gorgeous Services:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "id": "9480e020",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "gorgeous_connections = sugartrail.base.Network(company_id='05714521')\n",
+ "gorgeous_connections.hop.officer_appointments_maxsize = 20\n",
+ "gorgeous_connections.hop.officers_at_address_maxsize = 20\n",
+ "gorgeous_connections.hop.companies_at_address_maxsize = 20"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "fd678b28",
+ "metadata": {},
+ "source": [
+ "We can now pass both networks to the `find_network_connections` function, which returns any connections found between two networks. The function accepts two networks as input and an optional `max_depth` value (defaults to 5) which sets the maximum number of hops performed on each network. `find_network_connections` grows both networks one hop at a time and completes when connections are found or the `max_depth` is reached."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "id": "b4036e3d",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "1/5 hops completed.\n",
+ "2/5 hops completed.\n",
+ "3/5 hops completed.\n",
+ "Found connection(s)!\n"
+ ]
+ }
+ ],
+ "source": [
+ "connections = sugartrail.processing.find_network_connections(zahawi_connections, gorgeous_connections)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "bac64a8e",
+ "metadata": {},
+ "source": [
+ "Looks like a connection was found. We can see by the long string of characters that it's an officer ID:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "id": "be034584",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "['g8BmvnpH8blqT87i93sgJeowx7I']"
+ ]
+ },
+ "execution_count": 5,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "connections"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "6cd89faa",
+ "metadata": {},
+ "source": [
+ "We can now trace the path from Zahawi & Zahawi to this connection:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "id": "9544095a",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " title | \n",
+ " depth | \n",
+ " node_type | \n",
+ " id | \n",
+ " link_type | \n",
+ " link | \n",
+ " node_index | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " ZAHAWI & ZAHAWI LTD | \n",
+ " 0 | \n",
+ " Company | \n",
+ " 07285998 | \n",
+ " | \n",
+ " | \n",
+ " a | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " Nadhim ZAHAWI | \n",
+ " 1 | \n",
+ " Person | \n",
+ " tKup8kXPh3-jx_5Bs-BkF5XCyPM | \n",
+ " Officer | \n",
+ " a | \n",
+ " b | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " YOUGOV PLC | \n",
+ " 2 | \n",
+ " Company | \n",
+ " 03607311 | \n",
+ " Appointment | \n",
+ " b | \n",
+ " c | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " Benjamin William ELLIOT | \n",
+ " 3 | \n",
+ " Person | \n",
+ " g8BmvnpH8blqT87i93sgJeowx7I | \n",
+ " Officer | \n",
+ " c | \n",
+ " d | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " title depth node_type id \\\n",
+ "0 ZAHAWI & ZAHAWI LTD 0 Company 07285998 \n",
+ "1 Nadhim ZAHAWI 1 Person tKup8kXPh3-jx_5Bs-BkF5XCyPM \n",
+ "2 YOUGOV PLC 2 Company 03607311 \n",
+ "3 Benjamin William ELLIOT 3 Person g8BmvnpH8blqT87i93sgJeowx7I \n",
+ "\n",
+ " link_type link node_index \n",
+ "0 a \n",
+ "1 Officer a b \n",
+ "2 Appointment b c \n",
+ "3 Officer c d "
+ ]
+ },
+ "execution_count": 6,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "pd.DataFrame(zahawi_connections.find_path('g8BmvnpH8blqT87i93sgJeowx7I'))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "613910a7",
+ "metadata": {},
+ "source": [
+ "... and the path from Gorgeous Services to the connection:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "id": "f810b714",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " title | \n",
+ " depth | \n",
+ " node_type | \n",
+ " id | \n",
+ " link_type | \n",
+ " link | \n",
+ " node_index | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " GORGEOUS SERVICES LIMITED | \n",
+ " 0 | \n",
+ " Company | \n",
+ " 05714521 | \n",
+ " | \n",
+ " | \n",
+ " a | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " Benjamin William ELLIOT | \n",
+ " 1 | \n",
+ " Person | \n",
+ " g8BmvnpH8blqT87i93sgJeowx7I | \n",
+ " Officer | \n",
+ " a | \n",
+ " b | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " title depth node_type id \\\n",
+ "0 GORGEOUS SERVICES LIMITED 0 Company 05714521 \n",
+ "1 Benjamin William ELLIOT 1 Person g8BmvnpH8blqT87i93sgJeowx7I \n",
+ "\n",
+ " link_type link node_index \n",
+ "0 a \n",
+ "1 Officer a b "
+ ]
+ },
+ "execution_count": 7,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "pd.DataFrame(gorgeous_connections.find_path('g8BmvnpH8blqT87i93sgJeowx7I'))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "3e6ffa85",
+ "metadata": {},
+ "source": [
+ "Reading both paths tells us how Zahawi & Zahawi connects to Gorgeous Services."
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.10.4"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/sugartrail/base.py b/sugartrail/base.py
index acf2591..8b30399 100644
--- a/sugartrail/base.py
+++ b/sugartrail/base.py
@@ -221,9 +221,9 @@ class Network:
f.close
def load(self, filename):
- """Loads network stored in JSON format from '../assets/networks/'."""
+ """Loads network stored in JSON format."""
if filename:
- f = open(f'../assets/networks/{filename}')
+ f = open(f'{filename}')
network_data = json.load(f)
self.graph = network_data['graph']
self.company_records = network_data['company_records']
@@ -372,7 +372,7 @@ class Network:
path = sugartrail.processing.asciiify_path(path)
return path
- def perform_hop(self, hops, company_data=None):
+ def perform_hop(self, hops, company_data=None, print_progress=True):
"""Gets companies, officers and addresses within n-degrees of seperation
from current nodes, where n is the number of hops."""
hop_history = []
@@ -396,26 +396,29 @@ class Network:
if address not in self.processed_addresses:
self.hop.search_address(self, address, company_data)
self.processed_addresses.append(address)
- IPython.display.clear_output(wait=True)
- print("Hop number: " + str(hop+1))
- print("Processed " + str(i+1) + "/" + str(len(selected_addresses)) + " addresses.")
+ if print_progress:
+ IPython.display.clear_output(wait=True)
+ print("Hop number: " + str(hop+1))
+ print("Processed " + str(i+1) + "/" + str(len(selected_addresses)) + " addresses.")
for j,company in enumerate(selected_companies):
if company not in self.processed_companies:
self.hop.search_company_id(self,company)
self.processed_companies.append(company)
- IPython.display.clear_output(wait=True)
- print("Hop number: " + str(hop+1))
- print("Processed " + str(len(selected_addresses)) + "/" + str(len(selected_addresses)) + " addresses.")
- print("Processed " + str(j+1) + "/" + str(len(selected_companies)) + " companies.")
+ if print_progress:
+ IPython.display.clear_output(wait=True)
+ print("Hop number: " + str(hop+1))
+ print("Processed " + str(len(selected_addresses)) + "/" + str(len(selected_addresses)) + " addresses.")
+ print("Processed " + str(j+1) + "/" + str(len(selected_companies)) + " companies.")
for k,officer in enumerate(selected_officers):
if officer not in self.processed_officers:
self.hop.search_officer_id(self,officer)
self.processed_officers.append(officer)
- IPython.display.clear_output(wait=True)
- print("Hop number: " + str(hop+1))
- print("Processed " + str(len(selected_addresses)) + "/" + str(len(selected_addresses)) + " addresses.")
- print("Processed " + str(len(selected_companies)) + "/" + str(len(selected_companies)) + " companies.")
- print("Processed " + str(k+1) + "/" + str(len(selected_officers)) + " officers.")
+ if print_progress:
+ IPython.display.clear_output(wait=True)
+ print("Hop number: " + str(hop+1))
+ print("Processed " + str(len(selected_addresses)) + "/" + str(len(selected_addresses)) + " addresses.")
+ print("Processed " + str(len(selected_companies)) + "/" + str(len(selected_companies)) + " companies.")
+ print("Processed " + str(k+1) + "/" + str(len(selected_officers)) + " officers.")
self.maxsize_entities = [i for n, i in enumerate(self.maxsize_entities) if i not in self.maxsize_entities[n + 1:]]
self.processed_officers, self.processed_companies, self.processed_addresses = [],[],[]
self.n += 1
diff --git a/sugartrail/hop.py b/sugartrail/hop.py
index 608a2ce..4a4d3a8 100644
--- a/sugartrail/hop.py
+++ b/sugartrail/hop.py
@@ -68,23 +68,24 @@ class Hop:
# get company address history
address_history = sugartrail.processing.build_address_history(company_id)
# network.address_history.extend(address_history)
- for address in address_history:
- if 'address' in address:
- network.address_history.append(address)
- new_address = address['address']
- if new_address not in network.graph:
- network.graph[new_address] = {
- 'depth': network.n+1,
- 'title': new_address,
- 'node_type': "Address",
- 'arcs': []
+ if address_history:
+ for address in address_history:
+ if 'address' in address:
+ network.address_history.append(address)
+ new_address = address['address']
+ if new_address not in network.graph:
+ network.graph[new_address] = {
+ 'depth': network.n+1,
+ 'title': new_address,
+ 'node_type': "Address",
+ 'arcs': []
+ }
+ arc = {
+ 'arc_type': "Historic Address",
+ 'start_node': company_id
}
- arc = {
- 'arc_type': "Historic Address",
- 'start_node': company_id
- }
- if arc not in network.graph[new_address]['arcs'] and network.graph[new_address]['depth'] == network.n+1:
- network.graph[new_address]['arcs'].append(arc)
+ if arc not in network.graph[new_address]['arcs'] and network.graph[new_address]['depth'] == network.n+1:
+ network.graph[new_address]['arcs'].append(arc)
def search_officer_id(self, network, officer_id):
"""Gets officers, companies and addresses connected to input officer
diff --git a/sugartrail/processing.py b/sugartrail/processing.py
index 5e8363f..493cca8 100644
--- a/sugartrail/processing.py
+++ b/sugartrail/processing.py
@@ -110,6 +110,21 @@ def process_address_changes(address_changes):
address_changes['items'][i]['description_values']['new_address'] = address_changes['items'][i-1]['description_values']['old_address']
return address_changes
+def find_network_connections(first_network, second_network, max_depth=5):
+    """Grows two networks hop by hop until their graphs share a node.
+
+    Each iteration performs one hop on both networks (with their progress
+    output suppressed), prints how many hops have completed, and then compares
+    the keys of the two graphs. Both networks are modified in place.
+
+    Args:
+        first_network: network exposing `perform_hop` and a `graph` mapping.
+        second_network: network exposing `perform_hop` and a `graph` mapping.
+        max_depth: maximum number of hops to perform on each network.
+
+    Returns:
+        A list of the non-empty graph keys present in both networks, in the
+        order they appear in `second_network.graph`, as soon as at least one
+        is found; None if no shared key is found within `max_depth` hops.
+    """
+    hops = 0
+    while hops < max_depth:
+        # Advance each network by a single hop before comparing them.
+        first_network.perform_hop(1, print_progress=False)
+        second_network.perform_hop(1, print_progress=False)
+        hops += 1
+        print(f"{hops}/{max_depth} hops completed.")
+        # Falsy keys (e.g. an empty string) are not counted as connections.
+        connectors = [
+            node for node in second_network.graph
+            if node and node in first_network.graph
+        ]
+        if connectors:
+            print("Found connection(s)!")
+            return connectors
+    print("No connections found.")
+    return None
+
def build_address_history(company_id):
"""Returns a list of dicts containing historic addresses for input company
(company_id)."""