mirror of
https://hub.njuu.cf/TheAlgorithms/Python.git
synced 2023-10-11 13:06:12 +08:00
Update k_means_clust.py (#8996)
* Update k_means_clust.py * Apply suggestions from code review --------- Co-authored-by: Tianyi Zheng <tianyizheng02@gmail.com>
This commit is contained in:
parent
b2e186f4b7
commit
84ec9414e4
@ -11,10 +11,10 @@ Inputs:
|
||||
- initial_centroids , initial centroid values generated by utility function (mentioned
|
||||
in usage).
|
||||
- maxiter , maximum number of iterations to process.
|
||||
- heterogeneity , empty list that will be filled with hetrogeneity values if passed
|
||||
- heterogeneity , empty list that will be filled with heterogeneity values if passed
|
||||
to kmeans func.
|
||||
Usage:
|
||||
1. define 'k' value, 'X' features array and 'hetrogeneity' empty list
|
||||
1. define 'k' value, 'X' features array and 'heterogeneity' empty list
|
||||
2. create initial_centroids,
|
||||
initial_centroids = get_initial_centroids(
|
||||
X,
|
||||
@ -31,8 +31,8 @@ Usage:
|
||||
record_heterogeneity=heterogeneity,
|
||||
verbose=True # whether to print logs in console or not (default=False)
|
||||
)
|
||||
4. Plot the loss function, hetrogeneity values for every iteration saved in
|
||||
hetrogeneity list.
|
||||
4. Plot the loss function and heterogeneity values for every iteration saved in
|
||||
heterogeneity list.
|
||||
plot_heterogeneity(
|
||||
heterogeneity,
|
||||
k
|
||||
@ -198,13 +198,10 @@ def report_generator(
|
||||
df: pd.DataFrame, clustering_variables: np.ndarray, fill_missing_report=None
|
||||
) -> pd.DataFrame:
|
||||
"""
|
||||
Function generates easy-erading clustering report. It takes 2 arguments as an input:
|
||||
DataFrame - dataframe with predicted cluester column;
|
||||
FillMissingReport - dictionary of rules how we are going to fill missing
|
||||
values of for final report generate (not included in modeling);
|
||||
in order to run the function following libraries must be imported:
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
Generates a clustering report. This function takes 2 arguments as input:
|
||||
df - dataframe with predicted cluster column
|
||||
fill_missing_report - dictionary of rules on how we are going to fill in missing
|
||||
values for final generated report (not included in modelling);
|
||||
>>> data = pd.DataFrame()
|
||||
>>> data['numbers'] = [1, 2, 3]
|
||||
>>> data['col1'] = [0.5, 2.5, 4.5]
|
||||
@ -306,10 +303,10 @@ def report_generator(
|
||||
a.columns = report.columns # rename columns to match report
|
||||
report = report.drop(
|
||||
report[report.Type == "count"].index
|
||||
) # drop count values except cluster size
|
||||
) # drop count values except for cluster size
|
||||
report = pd.concat(
|
||||
[report, a, clustersize, clusterproportion], axis=0
|
||||
) # concat report with clustert size and nan values
|
||||
) # concat report with cluster size and nan values
|
||||
report["Mark"] = report["Features"].isin(clustering_variables)
|
||||
cols = report.columns.tolist()
|
||||
cols = cols[0:2] + cols[-1:] + cols[2:-1]
|
||||
|
Loading…
Reference in New Issue
Block a user