Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
282 changes: 282 additions & 0 deletions Scripts/contactConnectivity.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,282 @@
#!/usr/bin/env python
# coding: utf-8

# Working package*********************
# Analysis of multiple businesses linked to one financial or authorized contact.
# This notebook performs multiple analyses, split into various sections.
# The analyses cover connectivity via either the financial or the authorized contact.
#
# Import useful packages
from itertools import combinations
import pandas as pd
import numpy as np
import pickle
import string
import csv
import os

# --------------- Helper functions --------------------------------

# Updated 15/08
def load_data(path, sheet=None, ignore_hash=True):
    """Load data from an Excel file and drop uninformative columns.

    Args:
        path (string): path to the Excel file
        sheet (string, optional): name of the sheet to read in a multi-sheet
            workbook. Defaults to None (first sheet).
        ignore_hash (bool, optional): drop system-hash columns whose name
            contains '(Do Not Modify)'. Defaults to True.

    Returns:
        pandas dataframe: the loaded data, without all-NaN columns and
        without columns holding a single distinct value
    """
    # read_excel selects the sheet directly; the original opened a
    # pd.ExcelFile handle that was never closed (resource leak).
    if sheet:
        data = pd.read_excel(path, sheet_name=sheet)
    else:
        data = pd.read_excel(path)
    # Some columns are system hashes; remove them
    if ignore_hash:
        data = data[[col for col in data.columns if '(Do Not Modify)' not in col]]

    data.dropna(axis=1, how='all', inplace=True)
    # Keep only columns with more than one distinct value
    data = data[[c for c in data.columns if len(data[c].unique()) > 1]]

    return data


def select_column(df, colnames):
    """Return *df* restricted to the given columns.

    Args:
        df (dataframe): data from which columns are selected
        colnames (list of columns): columns to keep, in order

    Returns:
        dataframe: data containing only *colnames*
    """
    selected = df[colnames]
    return selected


def add_column(d, cols='', new_col='ContactName'):
    """Add a column built by joining two existing columns with a space.

    Args:
        d (dataframe): data to which the new column is added; not mutated,
            a copy is returned
        cols (list of str, optional): the two source columns to combine.
            Defaults to '' which is treated as ['First Name', 'Last Name'].
            (The original docstring mislabeled this as the new column name.)
        new_col (str, optional): name of the new column. Defaults to
            'ContactName'.

    Returns:
        dataframe: copy of *d* with *new_col* added
    """
    if not cols:
        cols = ['First Name', 'Last Name']
    df = d.copy()
    # map(str) keeps missing values from raising; NaN becomes the string 'nan'
    df[new_col] = df[cols[0]].map(str) + ' ' + df[cols[1]].map(str)

    return df


def filter_data(df, col_name, row_vals, select=True):
    """Keep or drop rows whose *col_name* value is in *row_vals*.

    Args:
        df (pd.dataframe): data to operate on
        col_name (str): column name
        row_vals (list): column values to match against
        select (bool, optional): True keeps matching rows, False keeps
            the rest. Defaults to True.

    Returns:
        data: the filtered dataframe
    """
    mask = df[col_name].isin(row_vals)
    return df[mask] if select else df[~mask]

def multiple_app(df, group_col, filter_col):
    """Return the *group_col* values that appear on more than one row.

    Args:
        df (pd.dataframe): the dataframe to process
        group_col (str or list): column(s) to group by
        filter_col (str): column whose per-group count is compared to 1

    Returns:
        data: array of unique *group_col* values whose count exceeds 1
    """
    counts = (
        df.groupby(group_col, as_index=False)[filter_col]
        .count()
        .sort_values([filter_col], ascending=False)
    )
    repeated = counts[counts[filter_col] > 1]
    return repeated[group_col].unique()

# Putting everything together
def get_apps(df, exclude_status):
    """Combine the helpers to find applications that share a contact.

    Args:
        df (pd.dataframe): dataframe to process
        exclude_status (list): 'Application Status' values to exclude

    Returns:
        dataframe: applications whose financial or authorized contact
        appears on more than one application
    """
    remaining = filter_data(df, 'Application Status', exclude_status, select=False)

    # Contacts linked to more than one application
    fin_repeats = multiple_app(remaining, 'FinContactName', 'Reference Number')
    auth_repeats = multiple_app(remaining, 'AuthContactName', 'Reference Number')

    # Applications tied to those repeated contacts
    by_fin = filter_data(remaining, 'FinContactName', fin_repeats)
    by_auth = filter_data(remaining, 'AuthContactName', auth_repeats)

    # Merge both views and drop applications counted twice
    combined = pd.concat([by_fin, by_auth]).drop_duplicates('Reference Number')
    return combined.sort_values(by=['AuthContactName', 'FinContactName'])

#Created 15/08 Tested to get only unique dplicates according to columns
def get_duplicates(df, subset=None, unique=False):
    """Return the duplicated rows of *df*, judged on *subset* columns.

    Args:
        df (pd.dataframe): the dataframe to process
        subset (str or list): column(s) on which rows are compared; also
            used as the sort key of the result, so it must not be None
        unique (bool): when False (default), the later occurrences of each
            duplicated row are returned (the first occurrence is excluded —
            the original docstring incorrectly claimed "all" were returned);
            when True, a single representative row per duplicated group is
            returned

    Returns:
        duplicates: dataframe of duplicates, sorted by *subset* descending
    """
    if unique:
        # keep=False marks every member of a duplicated group; then keep
        # one representative per group
        duplicates = df[df.duplicated(subset=subset, keep=False)].drop_duplicates(subset=subset)
    else:
        # keep='first' marks all occurrences except the first
        duplicates = df[df.duplicated(subset=subset, keep='first')]

    return duplicates.sort_values(by=subset, ascending=False)
#------------- end helpers -----------------------------------------------

def get_non_duplicates(df, subset=None, keep='first'):
    """Drop duplicated rows, judged on *subset* columns.

    Args:
        df (pd.dataframe): the dataframe to process
        subset (str or list): column(s) on which rows are compared;
            None compares whole rows
        keep ('first'/'last'/False): which occurrence survives

    Returns:
        data: dataframe with duplicates removed
    """
    return df.drop_duplicates(subset=subset, keep=keep)

def get_non_duplicate_contacts(df, subset=None, keep='first'):
    """Deduplicate contact rows, using per-row missing-value counts as a tiebreak.

    Args:
        df (pd.dataframe): the dataframe to process; must contain a
            ' Full Name' column (note the leading space in the name)
        subset (str or list): column(s) on which duplicates are judged
        keep ('first'/'last'/False): which occurrence survives after the
            NanCount sort

    Returns:
        non_duplicates: deduplicated data; the old index is kept as a column
        by reset_index()
    """
    # Work on an explicit copy: the original assigned a new column to the
    # result of drop_duplicates(), which can trigger SettingWithCopyWarning.
    data = df.drop_duplicates().copy()
    # Count missing cells per row; used to rank rows within a duplicate group
    data['NanCount'] = data.isnull().sum(axis=1)
    # NOTE(review): ascending=False puts the row with the MOST NaNs first,
    # so keep='first' retains the least complete row — confirm this is the
    # intended "up to date" selection.
    data.sort_values(by=[' Full Name', 'NanCount'], ascending=False, inplace=True)
    # Drop the helper column before deduplicating
    non_duplicates = data.drop('NanCount', axis=1).drop_duplicates(subset=subset, keep=keep)

    return non_duplicates.reset_index()

# Sept 1, 2020
def get_connectivity(df, rel_col):
    """Yield pairs of rows that share the same *rel_col* value.

    Args:
        df (pd.DataFrame): data of applications; must contain the
            'FinContactName' and 'AuthContactName' columns
        rel_col (str or list of columns): column(s) to group by

    Yields:
        connection, relationship: tuple of the shared group key and a pair
        of row-value arrays (contact-name columns removed) connected by it
    """
    drop_rel_cols = ['FinContactName', 'AuthContactName']
    for connect, frame in df.groupby(rel_col):
        # All unordered row pairs within the group:
        # [r1, r2, r3] -> (r1, r2), (r1, r3), (r2, r3).
        # A single-row group yields nothing on its own; the original's
        # `if comb:` guard was dead code — a combinations object is
        # always truthy, even when exhausted.
        rows = frame.drop(drop_rel_cols, axis=1).values
        for related in combinations(rows, 2):
            yield connect, related



def combine_account(combined, relationship):
    """Place each pair of related businesses on a single row.

    Args:
        combined (iterable of (connection, pair)): tuples of the shared
            contact and a pair of business row-value arrays, as yielded by
            get_connectivity
        relationship (string): type of relationship between the businesses

    Returns:
        pd.dataframe: one row per related pair, with the contact name and
        relationship type appended
    """
    # Placeholder schema for the relationship rows
    columns = [
        'Reference Number B1', 'Business Number B1', 'Legal Name B1', 'Operating Name B1',
        'Reference Number B2', 'Business Number B2', 'Legal Name B2', 'Operating Name B2',
        'Related Contact Names', 'Relationship Types'
    ]

    rows = []
    for contact, connected_business in combined:
        # Flatten: business-1 values, business-2 values, contact, relation
        connected_account = np.concatenate(
            (connected_business[0], connected_business[1], [contact, relationship]),
            axis=None
        )
        rows.append(dict(zip(columns, connected_account)))

    # Build the frame in one shot: DataFrame.append was removed in
    # pandas 2.0 (and was O(n^2) when called per row).
    return pd.DataFrame(rows, columns=columns)

# 03/04/2020
def get_contact_difference(system_file, external_file, on):
    """Return the rows of *external_file* that are absent from *system_file*.

    Computes the difference B - (A n B) for A = system_file and
    B = external_file, compared on the *on* columns.

    Args:
        system_file (pd.dataframe): data that is present in the system
        external_file (pd.dataframe): external file to be imported
        on (list): columns to base the comparison on

    Returns:
        pd.dataframe: the rows of *external_file* to import
    """
    # Rows of external_file that also exist in the system (intersection)
    overlap = pd.merge(system_file[on], external_file, on=on)
    # Stacking the intersection on top of the external file makes every
    # shared row appear twice; drop_duplicates(keep=False) then removes
    # both copies, leaving only the external-only rows.
    stacked = pd.concat([overlap, external_file], sort=False)
    return stacked.drop_duplicates(keep=False)
#------------- end helpers -----------------------------------------------
Loading