mirror of
https://hub.njuu.cf/TheAlgorithms/Python.git
synced 2023-10-11 13:06:12 +08:00
add visualization of k means clustering as excel format (#2104)
* add visualization of kmneas clust as excel format * style changes * style changes * Add doctest and typehint! * style change * Update machine_learning/k_means_clust.py Co-authored-by: Christian Clauss <cclauss@me.com> * Update machine_learning/k_means_clust.py Co-authored-by: Christian Clauss <cclauss@me.com> Co-authored-by: Christian Clauss <cclauss@me.com>
This commit is contained in:
parent
b9e7c891e2
commit
d034add61f
@ -47,12 +47,18 @@ Usage:
|
||||
k
|
||||
)
|
||||
|
||||
5. Have fun..
|
||||
5. Transfers Dataframe into excel format it must have feature called
|
||||
'Clust' with k means clustering numbers in it.
|
||||
|
||||
|
||||
"""
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
from matplotlib import pyplot as plt
|
||||
from sklearn.metrics import pairwise_distances
|
||||
import warnings
|
||||
|
||||
warnings.filterwarnings("ignore")
|
||||
|
||||
TAG = "K-MEANS-CLUST/ "
|
||||
|
||||
@ -202,3 +208,158 @@ if False: # change to true to run this test case.
|
||||
verbose=True,
|
||||
)
|
||||
plot_heterogeneity(heterogeneity, k)
|
||||
|
||||
|
||||
def ReportGenerator(
|
||||
df: pd.DataFrame, ClusteringVariables: np.array, FillMissingReport=None
|
||||
) -> pd.DataFrame:
|
||||
"""
|
||||
Function generates easy-erading clustering report. It takes 2 arguments as an input:
|
||||
DataFrame - dataframe with predicted cluester column;
|
||||
FillMissingReport - dictionary of rules how we are going to fill missing
|
||||
values of for final report generate (not included in modeling);
|
||||
in order to run the function following libraries must be imported:
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
|
||||
>>> data = pd.DataFrame()
|
||||
>>> data['numbers'] = [1, 2, 3]
|
||||
>>> data['col1'] = [0.5, 2.5, 4.5]
|
||||
>>> data['col2'] = [100, 200, 300]
|
||||
>>> data['col3'] = [10, 20, 30]
|
||||
>>> data['Cluster'] = [1, 1, 2]
|
||||
>>> ReportGenerator(data, ['col1', 'col2'], 0)
|
||||
Features Type Mark 1 2
|
||||
0 # of Customers ClusterSize False 2.000000 1.000000
|
||||
1 % of Customers ClusterProportion False 0.666667 0.333333
|
||||
2 col1 mean_with_zeros True 1.500000 4.500000
|
||||
3 col2 mean_with_zeros True 150.000000 300.000000
|
||||
4 numbers mean_with_zeros False 1.500000 3.000000
|
||||
.. ... ... ... ... ...
|
||||
99 dummy 5% False 1.000000 1.000000
|
||||
100 dummy 95% False 1.000000 1.000000
|
||||
101 dummy stdev False 0.000000 NaN
|
||||
102 dummy mode False 1.000000 1.000000
|
||||
103 dummy median False 1.000000 1.000000
|
||||
<BLANKLINE>
|
||||
[104 rows x 5 columns]
|
||||
"""
|
||||
# Fill missing values with given rules
|
||||
if FillMissingReport:
|
||||
df.fillna(value=FillMissingReport, inplace=True)
|
||||
df["dummy"] = 1
|
||||
numeric_cols = df.select_dtypes(np.number).columns
|
||||
report = (
|
||||
df.groupby(["Cluster"])[ # constract report dataframe
|
||||
numeric_cols
|
||||
] # group by cluster number
|
||||
.agg(
|
||||
[
|
||||
("sum", np.sum),
|
||||
("mean_with_zeros", lambda x: np.mean(np.nan_to_num(x))),
|
||||
("mean_without_zeros", lambda x: x.replace(0, np.NaN).mean()),
|
||||
(
|
||||
"mean_25-75",
|
||||
lambda x: np.mean(
|
||||
np.nan_to_num(
|
||||
sorted(x)[
|
||||
round((len(x) * 25 / 100)) : round(len(x) * 75 / 100)
|
||||
]
|
||||
)
|
||||
),
|
||||
),
|
||||
("mean_with_na", np.mean),
|
||||
("min", lambda x: x.min()),
|
||||
("5%", lambda x: x.quantile(0.05)),
|
||||
("25%", lambda x: x.quantile(0.25)),
|
||||
("50%", lambda x: x.quantile(0.50)),
|
||||
("75%", lambda x: x.quantile(0.75)),
|
||||
("95%", lambda x: x.quantile(0.95)),
|
||||
("max", lambda x: x.max()),
|
||||
("count", lambda x: x.count()),
|
||||
("stdev", lambda x: x.std()),
|
||||
("mode", lambda x: x.mode()[0]),
|
||||
("median", lambda x: x.median()),
|
||||
("# > 0", lambda x: (x > 0).sum()),
|
||||
]
|
||||
)
|
||||
.T.reset_index()
|
||||
.rename(index=str, columns={"level_0": "Features", "level_1": "Type"})
|
||||
) # rename columns
|
||||
|
||||
clustersize = report[
|
||||
(report["Features"] == "dummy") & (report["Type"] == "count")
|
||||
] # caclulating size of cluster(count of clientID's)
|
||||
clustersize.Type = (
|
||||
"ClusterSize" # rename created cluster df to match report column names
|
||||
)
|
||||
clustersize.Features = "# of Customers"
|
||||
clusterproportion = pd.DataFrame(
|
||||
clustersize.iloc[:, 2:].values
|
||||
/ clustersize.iloc[:, 2:].values.sum() # caclulating proportion of cluster
|
||||
)
|
||||
clusterproportion[
|
||||
"Type"
|
||||
] = "% of Customers" # rename created cluster df to match report column names
|
||||
clusterproportion["Features"] = "ClusterProportion"
|
||||
cols = clusterproportion.columns.tolist()
|
||||
cols = cols[-2:] + cols[:-2]
|
||||
clusterproportion = clusterproportion[cols] # rearrange columns to match report
|
||||
clusterproportion.columns = report.columns
|
||||
a = pd.DataFrame(
|
||||
abs(
|
||||
report[report["Type"] == "count"].iloc[:, 2:].values
|
||||
- clustersize.iloc[:, 2:].values
|
||||
)
|
||||
) # generating df with count of nan values
|
||||
a["Features"] = 0
|
||||
a["Type"] = "# of nan"
|
||||
a.Features = report[
|
||||
report["Type"] == "count"
|
||||
].Features.tolist() # filling values in order to match report
|
||||
cols = a.columns.tolist()
|
||||
cols = cols[-2:] + cols[:-2]
|
||||
a = a[cols] # rearrange columns to match report
|
||||
a.columns = report.columns # rename columns to match report
|
||||
report = report.drop(
|
||||
report[report.Type == "count"].index
|
||||
) # drop count values except cluster size
|
||||
report = pd.concat(
|
||||
[report, a, clustersize, clusterproportion], axis=0
|
||||
) # concat report with clustert size and nan values
|
||||
report["Mark"] = report["Features"].isin(ClusteringVariables)
|
||||
cols = report.columns.tolist()
|
||||
cols = cols[0:2] + cols[-1:] + cols[2:-1]
|
||||
report = report[cols]
|
||||
sorter1 = {
|
||||
"ClusterSize": 9,
|
||||
"ClusterProportion": 8,
|
||||
"mean_with_zeros": 7,
|
||||
"mean_with_na": 6,
|
||||
"max": 5,
|
||||
"50%": 4,
|
||||
"min": 3,
|
||||
"25%": 2,
|
||||
"75%": 1,
|
||||
"# of nan": 0,
|
||||
"# > 0": -1,
|
||||
"sum_with_na": -2,
|
||||
}
|
||||
report = (
|
||||
report.assign(
|
||||
Sorter1=lambda x: x.Type.map(sorter1),
|
||||
Sorter2=lambda x: list(reversed(range(len(x)))),
|
||||
)
|
||||
.sort_values(["Sorter1", "Mark", "Sorter2"], ascending=False)
|
||||
.drop(["Sorter1", "Sorter2"], axis=1)
|
||||
)
|
||||
report.columns.name = ""
|
||||
report = report.reset_index()
|
||||
report.drop(columns=["index"], inplace=True)
|
||||
return report
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
import doctest
|
||||
|
||||
doctest.testmod()
|
||||
|
Loading…
Reference in New Issue
Block a user