Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
282 changes: 282 additions & 0 deletions Scripts/contactConnectivity.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,282 @@
#!/usr/bin/env python
# coding: utf-8

# Working package*********************
# Analysis of multiple businesses linked to one financial or authorized contact.
# This notebook performs multiple analyses, split into various sections.
# The analyses cover connectivity via either the financial or the authorized contact.
#
# Import useful packages
from itertools import combinations
import pandas as pd
import numpy as np
import pickle
import string
import csv
import os

# --------------- Helper functions --------------------------------

# Updated 15/08
def load_data(path, sheet=None, ignore_hash=True):
    """Load data from an Excel file and drop uninformative columns.

    Args:
        path (string): path to the Excel file
        sheet (string, optional): name of the sheet to read in a multi-sheet
            workbook. Defaults to None (first sheet).
        ignore_hash (bool, optional): drop system-hash columns whose name
            contains '(Do Not Modify)'. Defaults to True.

    Returns:
        pandas dataframe: the loaded data, without all-NaN columns and
        without columns holding a single distinct value
    """
    # read_excel selects the sheet directly; the original opened a
    # pd.ExcelFile handle that was never closed (resource leak).
    if sheet:
        data = pd.read_excel(path, sheet_name=sheet)
    else:
        data = pd.read_excel(path)
    # Some columns are system hashes; remove them
    if ignore_hash:
        data = data[[col for col in data.columns if '(Do Not Modify)' not in col]]

    data.dropna(axis=1, how='all', inplace=True)
    # Keep only columns with more than one distinct value
    data = data[[c for c in data.columns if len(data[c].unique()) > 1]]

    return data


def select_column(df, colnames):
    """Return *df* restricted to the given columns.

    Args:
        df (dataframe): data from which columns are selected
        colnames (list of columns): columns to keep, in order

    Returns:
        dataframe: data containing only *colnames*
    """
    selected = df[colnames]
    return selected


def add_column(d, cols='', new_col='ContactName'):
    """Add a column built by joining two existing columns with a space.

    Args:
        d (dataframe): data to which the new column is added; not mutated,
            a copy is returned
        cols (list of str, optional): the two source columns to combine.
            Defaults to '' which is treated as ['First Name', 'Last Name'].
            (The original docstring mislabeled this as the new column name.)
        new_col (str, optional): name of the new column. Defaults to
            'ContactName'.

    Returns:
        dataframe: copy of *d* with *new_col* added
    """
    if not cols:
        cols = ['First Name', 'Last Name']
    df = d.copy()
    # map(str) keeps missing values from raising; NaN becomes the string 'nan'
    df[new_col] = df[cols[0]].map(str) + ' ' + df[cols[1]].map(str)

    return df


def filter_data(df, col_name, row_vals, select=True):
    """Keep or drop rows whose *col_name* value is in *row_vals*.

    Args:
        df (pd.dataframe): data to operate on
        col_name (str): column name
        row_vals (list): column values to match against
        select (bool, optional): True keeps matching rows, False keeps
            the rest. Defaults to True.

    Returns:
        data: the filtered dataframe
    """
    mask = df[col_name].isin(row_vals)
    return df[mask] if select else df[~mask]

def multiple_app(df, group_col, filter_col):
    """Return the *group_col* values that appear on more than one row.

    Args:
        df (pd.dataframe): the dataframe to process
        group_col (str or list): column(s) to group by
        filter_col (str): column whose per-group count is compared to 1

    Returns:
        data: array of unique *group_col* values whose count exceeds 1
    """
    counts = (
        df.groupby(group_col, as_index=False)[filter_col]
        .count()
        .sort_values([filter_col], ascending=False)
    )
    repeated = counts[counts[filter_col] > 1]
    return repeated[group_col].unique()

# Putting everything together
def get_apps(df, exclude_status):
    """Combine the helpers to find applications that share a contact.

    Args:
        df (pd.dataframe): dataframe to process
        exclude_status (list): 'Application Status' values to exclude

    Returns:
        dataframe: applications whose financial or authorized contact
        appears on more than one application
    """
    remaining = filter_data(df, 'Application Status', exclude_status, select=False)

    # Contacts linked to more than one application
    fin_repeats = multiple_app(remaining, 'FinContactName', 'Reference Number')
    auth_repeats = multiple_app(remaining, 'AuthContactName', 'Reference Number')

    # Applications tied to those repeated contacts
    by_fin = filter_data(remaining, 'FinContactName', fin_repeats)
    by_auth = filter_data(remaining, 'AuthContactName', auth_repeats)

    # Merge both views and drop applications counted twice
    combined = pd.concat([by_fin, by_auth]).drop_duplicates('Reference Number')
    return combined.sort_values(by=['AuthContactName', 'FinContactName'])

#Created 15/08 Tested to get only unique dplicates according to columns
def get_duplicates(df, subset=None, unique=False):
    """Return the duplicated rows of *df*, judged on *subset* columns.

    Args:
        df (pd.dataframe): the dataframe to process
        subset (str or list): column(s) on which rows are compared; also
            used as the sort key of the result, so it must not be None
        unique (bool): when False (default), the later occurrences of each
            duplicated row are returned (the first occurrence is excluded —
            the original docstring incorrectly claimed "all" were returned);
            when True, a single representative row per duplicated group is
            returned

    Returns:
        duplicates: dataframe of duplicates, sorted by *subset* descending
    """
    if unique:
        # keep=False marks every member of a duplicated group; then keep
        # one representative per group
        duplicates = df[df.duplicated(subset=subset, keep=False)].drop_duplicates(subset=subset)
    else:
        # keep='first' marks all occurrences except the first
        duplicates = df[df.duplicated(subset=subset, keep='first')]

    return duplicates.sort_values(by=subset, ascending=False)
#------------- end helpers -----------------------------------------------

def get_non_duplicates(df, subset=None, keep='first'):
    """Drop duplicated rows, judged on *subset* columns.

    Args:
        df (pd.dataframe): the dataframe to process
        subset (str or list): column(s) on which rows are compared;
            None compares whole rows
        keep ('first'/'last'/False): which occurrence survives

    Returns:
        data: dataframe with duplicates removed
    """
    return df.drop_duplicates(subset=subset, keep=keep)

def get_non_duplicate_contacts(df, subset=None, keep='first'):
    """Deduplicate contact rows, using per-row missing-value counts as a tiebreak.

    Args:
        df (pd.dataframe): the dataframe to process; must contain a
            ' Full Name' column (note the leading space in the name)
        subset (str or list): column(s) on which duplicates are judged
        keep ('first'/'last'/False): which occurrence survives after the
            NanCount sort

    Returns:
        non_duplicates: deduplicated data; the old index is kept as a column
        by reset_index()
    """
    # Work on an explicit copy: the original assigned a new column to the
    # result of drop_duplicates(), which can trigger SettingWithCopyWarning.
    data = df.drop_duplicates().copy()
    # Count missing cells per row; used to rank rows within a duplicate group
    data['NanCount'] = data.isnull().sum(axis=1)
    # NOTE(review): ascending=False puts the row with the MOST NaNs first,
    # so keep='first' retains the least complete row — confirm this is the
    # intended "up to date" selection.
    data.sort_values(by=[' Full Name', 'NanCount'], ascending=False, inplace=True)
    # Drop the helper column before deduplicating
    non_duplicates = data.drop('NanCount', axis=1).drop_duplicates(subset=subset, keep=keep)

    return non_duplicates.reset_index()

# Sept 1, 2020
def get_connectivity(df, rel_col):
    """Yield pairs of rows that share the same *rel_col* value.

    Args:
        df (pd.DataFrame): data of applications; must contain the
            'FinContactName' and 'AuthContactName' columns
        rel_col (str or list of columns): column(s) to group by

    Yields:
        connection, relationship: tuple of the shared group key and a pair
        of row-value arrays (contact-name columns removed) connected by it
    """
    drop_rel_cols = ['FinContactName', 'AuthContactName']
    for connect, frame in df.groupby(rel_col):
        # All unordered row pairs within the group:
        # [r1, r2, r3] -> (r1, r2), (r1, r3), (r2, r3).
        # A single-row group yields nothing on its own; the original's
        # `if comb:` guard was dead code — a combinations object is
        # always truthy, even when exhausted.
        rows = frame.drop(drop_rel_cols, axis=1).values
        for related in combinations(rows, 2):
            yield connect, related



def combine_account(combined, relationship):
    """Place each pair of related businesses on a single row.

    Args:
        combined (iterable of (connection, pair)): tuples of the shared
            contact and a pair of business row-value arrays, as yielded by
            get_connectivity
        relationship (string): type of relationship between the businesses

    Returns:
        pd.dataframe: one row per related pair, with the contact name and
        relationship type appended
    """
    # Placeholder schema for the relationship rows
    columns = [
        'Reference Number B1', 'Business Number B1', 'Legal Name B1', 'Operating Name B1',
        'Reference Number B2', 'Business Number B2', 'Legal Name B2', 'Operating Name B2',
        'Related Contact Names', 'Relationship Types'
    ]

    rows = []
    for contact, connected_business in combined:
        # Flatten: business-1 values, business-2 values, contact, relation
        connected_account = np.concatenate(
            (connected_business[0], connected_business[1], [contact, relationship]),
            axis=None
        )
        rows.append(dict(zip(columns, connected_account)))

    # Build the frame in one shot: DataFrame.append was removed in
    # pandas 2.0 (and was O(n^2) when called per row).
    return pd.DataFrame(rows, columns=columns)

# 03/04/2020
def get_contact_difference(system_file, external_file, on):
    """Return the rows of *external_file* that are absent from *system_file*.

    Computes the difference B - (A n B) for A = system_file and
    B = external_file, compared on the *on* columns.

    Args:
        system_file (pd.dataframe): data that is present in the system
        external_file (pd.dataframe): external file to be imported
        on (list): columns to base the comparison on

    Returns:
        pd.dataframe: the rows of *external_file* to import
    """
    # Rows of external_file that also exist in the system (intersection)
    overlap = pd.merge(system_file[on], external_file, on=on)
    # Stacking the intersection on top of the external file makes every
    # shared row appear twice; drop_duplicates(keep=False) then removes
    # both copies, leaving only the external-only rows.
    stacked = pd.concat([overlap, external_file], sort=False)
    return stacked.drop_duplicates(keep=False)
#------------- end helpers -----------------------------------------------
Loading