import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.pylab as plt
import plotly.express as px
plt.style.use('ggplot')
from datetime import timedelta
import glob
import os
# Dezimalstellen auf 2 stellen für pandas DataFrames
pd.set_option('display.float_format', '{:.2f}'.format)

from plotly.subplots import make_subplots
import plotly.graph_objects as go
import matplotlib.dates as mdates
from matplotlib.ticker import FuncFormatter
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold, cross_validate
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix, RocCurveDisplay
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.ensemble import RandomForestClassifier
from imblearn.ensemble import BalancedRandomForestClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.preprocessing import FunctionTransformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import (classification_report, roc_auc_score, confusion_matrix, RocCurveDisplay)
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from scipy.stats import randint
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import precision_score, recall_score, f1_score
import pandas as pd
from sklearn.inspection import permutation_importance
from sklearn.inspection import PartialDependenceDisplay
import matplotlib.ticker as mtick
from sklearn.metrics import precision_recall_curve, roc_curve, auc
from sklearn.model_selection import StratifiedKFold, cross_val_predict

import plotly.io as pio
pio.renderers.default = "notebook_connected"

import warnings

# Unterdrückt alle zukünftigen Warnungen (z. B. FutureWarning)
warnings.simplefilter(action='ignore', category=FutureWarning)

# Einlesen der Kreditkartendaten mit spezifizierten Datentypen:
# 'card_id' und 'disp_id' als Integer, 'type' als kategorische Variable,
# sowie 'issued' als Datum im Format "%y%m%d %H:%M:%S"
credit_card_df = pd.read_csv("xselling_banking_data/card.csv", delimiter=";", dtype={
    "card_id": "int64",
    "disp_id": "int64",
    "type": "category"},
    parse_dates=["issued"],
    date_parser=lambda x: pd.to_datetime(x, format="%y%m%d %H:%M:%S"))

credit_card_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 892 entries, 0 to 891
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype         
---  ------   --------------  -----         
 0   card_id  892 non-null    int64         
 1   disp_id  892 non-null    int64         
 2   type     892 non-null    category      
 3   issued   892 non-null    datetime64[ns]
dtypes: category(1), datetime64[ns](1), int64(2)
memory usage: 22.0 KB

# Pandas zeigt in head() keine Uhrzeit an, wenn sie 00:00:00 ist – Anzeigeoptimierung
credit_card_df.head()

# Prüfen der Nullwerte
credit_card_df.isna().sum()

card_id    0
disp_id    0
type       0
issued     0
dtype: int64

credit_card_df.describe(include="all")

credit_card_df.nunique()

card_id    892
disp_id    892
type         3
issued     607
dtype: int64

credit_card_df['disp_id'].is_unique and credit_card_df['card_id'].is_unique

True

# Berechnung der Anzahl und Prozentanteile der verschiedenen Kreditkartentypen
type_counts = credit_card_df["type"].value_counts().reset_index()
type_counts.columns = ['type', 'count']

# Prozentuale Anteile der Kreditkartentypen berechnen und auf 1 Dezimalstelle runden
type_counts['percent'] = ((type_counts['count'] / type_counts['count'].sum()) * 100).round(1)

# Balkendiagramm zur Visualisierung der Verteilung der Kreditkartentypen
fig = px.bar(
    type_counts, 
    x='type', 
    y='percent', 
    text_auto='.1f', 
    color='type', 
    hover_data=['count'],
    title='Distribution of Credit Cards by Type',
    labels={'type': 'Type', 'percent': 'Percentage (%)'}
)

# Textbeschriftungen ausserhalb der Balken anzeigen
fig.update_traces(textposition='outside', cliponaxis=False)
fig.show()

# Berechnung der Anzahl ausgegebener Kreditkarten pro Jahr inklusive prozentualer Anteile
yearly_counts = credit_card_df['issued'].dt.year.value_counts().sort_index().reset_index()
yearly_counts.columns = ['year', 'count']  

yearly_counts['percent'] = (yearly_counts['count'] / yearly_counts['count'].sum() * 100).round(1)

# Balkendiagramm zur Darstellung des prozentualen Anteils der ausgegebenen Karten pro Jahr
fig = px.bar(
    yearly_counts,
    x='year',
    y='percent',
    text_auto=True,
    title="Percentage of Cards Issued per Year"
)
fig.update_traces(textfont_size=12, textangle=0, textposition="outside", cliponaxis=False)
fig.update_layout(yaxis_title='Percent (%)', xaxis_title='Year')
fig.show()

#Zeitliche Reihenfolge vs. ID-Reihenfolge
#Ist card_id in etwa mit issued korreliert?

credit_card_df[['card_id', 'issued']].sort_values('issued').head(10)

credit_card_df['issued'].dt.month.value_counts(normalize=True).mul(100).round(1)

issued
12   10.90
11   10.70
10   10.40
9    10.10
7     9.20
8     8.90
6     8.30
5     7.60
1     7.20
4     6.10
3     5.50
2     5.30
Name: proportion, dtype: float64

# Einlesen der Dispositionsdaten mit spezifizierten Datentypen für IDs und Kategorie
disposition_df = pd.read_csv("xselling_banking_data/disp.csv", delimiter=";", dtype={
    "client_id": "int64",
    "account_id": "int64",
    "disp_id": "int64",
    "type": "category"})

disposition_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5369 entries, 0 to 5368
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   disp_id     5369 non-null   int64   
 1   client_id   5369 non-null   int64   
 2   account_id  5369 non-null   int64   
 3   type        5369 non-null   category
dtypes: category(1), int64(3)
memory usage: 131.3 KB

disposition_df.head()

# Prüfen der Nullwerte
disposition_df.isna().sum()

disp_id       0
client_id     0
account_id    0
type          0
dtype: int64

disposition_df.nunique()

disp_id       5369
client_id     5369
account_id    4500
type             2
dtype: int64

disposition_df['disp_id'].is_unique and disposition_df['client_id'].is_unique

True

disposition_df.describe(include="all")

# Berechnung der Anzahl und prozentualen Anteile der verschiedenen Dispositionstypen (Kontozugriffsrollen)
type_counts_disp = disposition_df['type'].value_counts().reset_index(name='count')
type_counts_disp.columns = ['type', 'count'] 
type_counts_disp['percent'] = (type_counts_disp['count'] / type_counts_disp['count'].sum() * 100).round(2)

# Balkendiagramm zur Visualisierung der Verteilung der Kontozugriffsrollen
fig = px.bar(
    type_counts_disp,
    x='type',
    y='percent',
    text_auto=True,
    color='type',
    hover_data={"count": True}
)

fig.update_traces(
    textfont_size=12,
    textangle=0,
    textposition="outside",
    cliponaxis=False
)

fig.update_layout(
    title="Distribution of Account Access Roles",
    xaxis_title="Role",
    yaxis_title="Percentage (%)"
)

fig.show()

# Wie viele disposition-Einträge hat jede client_id?
client_counts = disposition_df['client_id'].value_counts()
client_counts

# Zeige alle client_ids, die mehr als 1 Eintrag haben:
client_counts[client_counts > 1]

Series([], Name: count, dtype: int64)

# Einlesen der Kontodaten mit spezifizierten Datentypen für IDs und Frequenz
# sowie Parsing der Spalte 'date' als Datum im Format "%y%m%d"
accounts_df = pd.read_csv("xselling_banking_data/account.csv", delimiter=";", dtype={
    "account_id": "int64",
    "district_id": "int64",
    "frequency": "category"}, 
    parse_dates=["date"], date_format="%y%m%d")

accounts_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4500 entries, 0 to 4499
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   account_id   4500 non-null   int64         
 1   district_id  4500 non-null   int64         
 2   frequency    4500 non-null   category      
 3   date         4500 non-null   datetime64[ns]
dtypes: category(1), datetime64[ns](1), int64(2)
memory usage: 110.0 KB

accounts_df.head()

accounts_df.isna().sum()

account_id     0
district_id    0
frequency      0
date           0
dtype: int64

accounts_df['frequency'] = accounts_df['frequency'].map({
    "POPLATEK MESICNE": "monthly_issuance",
    "POPLATEK TYDNE": "weekly_issuance",
    "POPLATEK PO OBRATU": "issuance_after_transaction"
    
})

accounts_df["frequency"].unique()

['monthly_issuance', 'issuance_after_transaction', 'weekly_issuance']
Categories (3, object): ['monthly_issuance', 'issuance_after_transaction', 'weekly_issuance']

accounts_df.describe(include="all")

accounts_df['account_id'].is_unique

True

accounts_df['district_id'].value_counts(normalize=True).mul(100).round(1)

district_id
1    12.30
70    3.40
74    3.00
54    2.80
64    2.00
      ... 
37    0.80
13    0.80
20    0.80
22    0.80
58    0.70
Name: proportion, Length: 77, dtype: float64

accounts_df['frequency'].value_counts(normalize=True).mul(100).round(1)

frequency
monthly_issuance             92.60
weekly_issuance               5.30
issuance_after_transaction    2.10
Name: proportion, dtype: float64

# Berechnung der Anzahl und prozentualen Anteile der verschiedenen Frequenzwerte in den Kontodaten
type_counts_account = accounts_df['frequency'].value_counts().reset_index(name='count')
type_counts_account.columns = ['frequency', 'count'] 
type_counts_account['percent'] = (type_counts_account['count'] / type_counts_account['count'].sum() * 100).round(2)

# Balkendiagramm zur Darstellung der Verteilung der Frequenzen
fig = px.bar(
    type_counts_account,
    x='frequency',
    y='percent',
    text_auto=True,
    color='frequency',
    hover_data={"count": True}
)

fig.update_traces(
    textfont_size=12,
    textangle=0,
    textposition="outside",
    cliponaxis=False
)

fig.update_layout(
    title="Distribution of Frequencies",
    xaxis_title="Frequency",
    yaxis_title="Percentage (%)"
)

fig.show()

accounts_df[['account_id', 'date']].sort_values('date').head(10)

# Einlesen der Bezirksdaten (ohne spezielle Datentypangaben)
district_df = pd.read_csv(
    "xselling_banking_data/district.csv",
    sep=";"
)

district_df.head()

district_df.columns = [
    'district_id', 
    'district_name', 
    'region', 
    'n_inhabitants', 
    'n_municipals_lower_499', 
    'n_municipals_between_500_1999', 
    'n_municipals_between_2000_9999', 
    'n_municipals_higher_10000', 
    'n_cities', 
    'ratio_urban_inhabitants', 
    'avg_salary', 
    'unemployment_rate_1995', 
    'unemployment_rate_1996', 
    'n_enterpreneurs_per_1k_inhabitants', 
    'n_crimes_1995', 
    'n_crimes_1996'
    ]

district_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 77 entries, 0 to 76
Data columns (total 16 columns):
 #   Column                              Non-Null Count  Dtype  
---  ------                              --------------  -----  
 0   district_id                         77 non-null     int64  
 1   district_name                       77 non-null     object 
 2   region                              77 non-null     object 
 3   n_inhabitants                       77 non-null     int64  
 4   n_municipals_lower_499              77 non-null     int64  
 5   n_municipals_between_500_1999       77 non-null     int64  
 6   n_municipals_between_2000_9999      77 non-null     int64  
 7   n_municipals_higher_10000           77 non-null     int64  
 8   n_cities                            77 non-null     int64  
 9   ratio_urban_inhabitants             77 non-null     float64
 10  avg_salary                          77 non-null     int64  
 11  unemployment_rate_1995              77 non-null     object 
 12  unemployment_rate_1996              77 non-null     float64
 13  n_enterpreneurs_per_1k_inhabitants  77 non-null     int64  
 14  n_crimes_1995                       77 non-null     object 
 15  n_crimes_1996                       77 non-null     int64  
dtypes: float64(2), int64(10), object(4)
memory usage: 9.8+ KB

district_df["n_crimes_1995"].unique()

array(['85677', '2159', '2824', '5244', '2616', '2640', '4289', '5179',
       '2987', '3810', '3475', '3804', '1597', '6604', '1845', '1874',
       '1003', '1740', '999', '1563', '2299', '1089', '2879', '5198',
       '1822', '6041', '1029', '1580', '818', '2985', '1328', '4340',
       '4650', '5323', '3384', '5796', '4147', '2653', '4947', '6949',
       '6445', '1658', '4085', '2166', '2080', '2854', '6079', '1655',
       '1660', '2123', '3496', '2564', '1850', '18721', '3659', '3729',
       '2212', '2595', '1879', '2112', '2719', '1562', '4484', '2157',
       '2247', '3244', '5623', '?', '9878', '4980', '9672', '4355',
       '18782', '4063', '3736', '3460'], dtype=object)

district_df[district_df == "?"].count()

district_id                           0
district_name                         0
region                                0
n_inhabitants                         0
n_municipals_lower_499                0
n_municipals_between_500_1999         0
n_municipals_between_2000_9999        0
n_municipals_higher_10000             0
n_cities                              0
ratio_urban_inhabitants               0
avg_salary                            0
unemployment_rate_1995                1
unemployment_rate_1996                0
n_enterpreneurs_per_1k_inhabitants    0
n_crimes_1995                         1
n_crimes_1996                         0
dtype: int64

district_df['district_id'].count()

np.int64(77)

# Fehlende Werte in 'unemployment_rate_1995' und 'n_crimes_1995' durch NaN ersetzen
district_df['unemployment_rate_1995'] = district_df['unemployment_rate_1995'].replace("?", np.nan)
district_df['n_crimes_1995'] = district_df['n_crimes_1995'].replace("?", np.nan)

# Umwandlung der Spalten in numerische Datentypen
district_df['unemployment_rate_1995'] = district_df['unemployment_rate_1995'].apply(pd.to_numeric)
district_df['n_crimes_1995'] = district_df['n_crimes_1995'].apply(pd.to_numeric)

# Umwandlung der Spalten 'district_name' und 'region' in den String-Datentyp
district_df['district_name'] = district_df['district_name'].astype('string')
district_df['region'] = district_df['region'].astype('string')

district_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 77 entries, 0 to 76
Data columns (total 16 columns):
 #   Column                              Non-Null Count  Dtype  
---  ------                              --------------  -----  
 0   district_id                         77 non-null     int64  
 1   district_name                       77 non-null     string 
 2   region                              77 non-null     string 
 3   n_inhabitants                       77 non-null     int64  
 4   n_municipals_lower_499              77 non-null     int64  
 5   n_municipals_between_500_1999       77 non-null     int64  
 6   n_municipals_between_2000_9999      77 non-null     int64  
 7   n_municipals_higher_10000           77 non-null     int64  
 8   n_cities                            77 non-null     int64  
 9   ratio_urban_inhabitants             77 non-null     float64
 10  avg_salary                          77 non-null     int64  
 11  unemployment_rate_1995              76 non-null     float64
 12  unemployment_rate_1996              77 non-null     float64
 13  n_enterpreneurs_per_1k_inhabitants  77 non-null     int64  
 14  n_crimes_1995                       76 non-null     float64
 15  n_crimes_1996                       77 non-null     int64  
dtypes: float64(4), int64(10), string(2)
memory usage: 9.8 KB

district_df.describe(include="all")

district_df.nunique()

district_id                           77
district_name                         77
region                                 8
n_inhabitants                         77
n_municipals_lower_499                53
n_municipals_between_500_1999         36
n_municipals_between_2000_9999        17
n_municipals_higher_10000              6
n_cities                              11
ratio_urban_inhabitants               70
avg_salary                            76
unemployment_rate_1995                70
unemployment_rate_1996                73
n_enterpreneurs_per_1k_inhabitants    44
n_crimes_1995                         75
n_crimes_1996                         76
dtype: int64

district_df.head()

# Berechnung der Anzahl und prozentualen Anteile der Bezirke pro Region
type_counts = district_df["region"].value_counts().reset_index()
type_counts.columns = ['region', 'count']

type_counts['percent'] = ((type_counts['count'] / type_counts['count'].sum()) * 100).round(1)

# Balkendiagramm zur Verteilung der Bezirke auf die Regionen
fig = px.bar(
    type_counts, 
    x='region', 
    y='percent', 
    text_auto='.1f', 
    color='region', 
    hover_data=['count'],
    title='Distribution of Districts to Regions',
    labels={'region': 'Region', 'percent': 'Percentage (%)'}
)

fig.update_traces(textposition='outside', cliponaxis=False)
fig.show()

# Summe der Einwohner pro Region berechnen und prozentualen Anteil an Gesamtbevölkerung ermitteln
region_population = district_df.groupby('region')['n_inhabitants'].sum().reset_index()
region_population['inhabitant_percentage'] = (region_population['n_inhabitants'] / region_population['n_inhabitants'].sum()) * 100

# Sortierung der Regionen nach Bevölkerungsanteil (absteigend)
region_population = region_population.sort_values(by='inhabitant_percentage', ascending=False)

# Balkendiagramm zur Darstellung des Bevölkerungsanteils je Region
fig = px.bar(
    region_population, 
    x='region', 
    y='inhabitant_percentage', 
    title='Share of Total Population by Region (%)',
    labels={'region': 'Region', 'inhabitant_percentage': 'Population Share (%)'},
    text_auto='.2f',
    color='region'
)

fig.update_traces(textposition='outside', cliponaxis=False)

fig.show()

# Auswahl der Spalten zu Gemeinden verschiedener Grössenklassen je Region
municipals_df = district_df[['region', 'n_municipals_lower_499', 'n_municipals_between_500_1999', 
                             'n_municipals_between_2000_9999', 'n_municipals_higher_10000']]

# Umwandlung in ein langes Format für gestapeltes Balkendiagramm
municipals_df_long = municipals_df.melt(id_vars='region', 
                                         value_vars=['n_municipals_lower_499', 'n_municipals_between_500_1999', 
                                                     'n_municipals_between_2000_9999', 'n_municipals_higher_10000'],
                                         var_name='municipal_type', value_name='municipal_count')

# Berechnung des prozentualen Anteils der Gemeinden je Region
municipals_df_long['percent'] = municipals_df_long.groupby('region')['municipal_count'].transform(lambda x: x / x.sum() * 100)

# Gestapeltes Balkendiagramm zur Verteilung der Gemeindegrössenklassen je Region
fig = px.bar(
    municipals_df_long, 
    x='region', 
    y='percent', 
    color='municipal_type', 
    title='Percentage Distribution of Municipal Counts by Region',
    labels={'percent': 'Percentage (%)', 'region': 'Region', 'municipal_type': 'Municipal Type'},
    barmode='stack'
)

# Layout-Anpassungen
fig.update_layout(
    xaxis_title='Region',
    yaxis_title='Percentage (%)',
    xaxis=dict(tickangle=45)
)

fig.show()

# Summe der Städte pro Region berechnen und absteigend sortieren
region_cities = district_df.groupby('region')['n_cities'].sum().reset_index()
region_cities = region_cities.sort_values(by='n_cities', ascending=False)

# Balkendiagramm zur Darstellung der Gesamtzahl der Städte je Region
fig = px.bar(
    region_cities, 
    x='region', 
    y='n_cities', 
    title='Total Number of Cities by Region',
    labels={'region': 'Region', 'n_cities': 'Total Number of Cities'},
    color='region',
    text_auto=True
)

fig.update_traces(textposition='outside', cliponaxis=False)

fig.show()

# Durchschnittliches Gehalt pro Region berechnen und absteigend sortieren
region_avg_salary = district_df.groupby('region')['avg_salary'].mean().reset_index()
region_avg_salary = region_avg_salary.sort_values(by='avg_salary', ascending=False)

# Balkendiagramm zur Darstellung des durchschnittlichen Gehalts je Region
fig = px.bar(
    region_avg_salary, 
    x='region', 
    y='avg_salary', 
    title='Average Salary by Region',
    labels={'region': 'Region', 'avg_salary': 'Average Salary'},
    color='region',
    text_auto='.1f'
)

fig.update_traces(textposition='outside', cliponaxis=False)

fig.show()

# Durchschnittliche Anzahl Unternehmer pro 1.000 Einwohner je Region berechnen
region_n_enterpreneurs_per_1k = district_df.groupby('region')['n_enterpreneurs_per_1k_inhabitants'].mean().reset_index()

# Prozentualer Anteil an der Gesamtzahl der Unternehmer pro 1.000 Einwohner berechnen
region_n_enterpreneurs_per_1k['n_enterpreneurs_percentage'] = (
    region_n_enterpreneurs_per_1k['n_enterpreneurs_per_1k_inhabitants'] /
    region_n_enterpreneurs_per_1k['n_enterpreneurs_per_1k_inhabitants'].sum()
) * 100

# Sortierung nach der durchschnittlichen Unternehmerzahl pro 1.000 Einwohner (absteigend)
region_n_enterpreneurs_per_1k = region_n_enterpreneurs_per_1k.sort_values(by='n_enterpreneurs_per_1k_inhabitants', ascending=False)

# Balkendiagramm zur Darstellung des Anteils der Unternehmer je Region
fig = px.bar(
    region_n_enterpreneurs_per_1k, 
    x='region', 
    y='n_enterpreneurs_percentage', 
    title='Share of Total Entrepreneurs per 1,000 Inhabitants by Region (%)',
    labels={
        'region': 'Region',
        'n_enterpreneurs_percentage': 'Entrepreneur Share (%)'
    },
    color='region',
    text_auto='.2f'
)

fig.update_traces(textposition='outside', cliponaxis=False)

fig.show()

# Durchschnittliche Arbeitslosenquoten 1995 und 1996 je Region berechnen
region_unemployment_rate = district_df.groupby('region')[['unemployment_rate_1995', 'unemployment_rate_1996']].mean().reset_index()

# Gruppiertes Balkendiagramm zur Darstellung der Arbeitslosenquoten je Region für 1995 und 1996
fig = px.bar(
    region_unemployment_rate, 
    x='region', 
    y=['unemployment_rate_1995', 'unemployment_rate_1996'], 
    title='Unemployment Rates 1995 and 1996 by Region',
    labels={'region': 'Region', 'value': 'Unemployment Rate'},
    barmode='group',
    text_auto='.2f'
)

fig.update_traces(textposition='outside', cliponaxis=False)

fig.show()

# Summe der Straftaten 1995 und 1996 je Region berechnen
region_n_crimes = district_df.groupby('region')[['n_crimes_1995', 'n_crimes_1996']].sum().reset_index()

# Gruppiertes Balkendiagramm zur Darstellung der Anzahl der Straftaten je Region für 1995 und 1996
fig = px.bar(
    region_n_crimes, 
    x='region', 
    y=['n_crimes_1995', 'n_crimes_1996'], 
    title='Number of Crimes 1995 and 1996 by Region',
    labels={'region': 'Region', 'value': 'Number of Crimes'},
    barmode='group',
    text_auto='.0f'
)

fig.update_traces(textposition='outside', cliponaxis=False)

fig.show()

# Summe der Straftaten und Einwohner je Region berechnen
region_n_crimes = district_df.groupby('region')[
    ['n_crimes_1995', 'n_crimes_1996', 'n_inhabitants']
].sum().reset_index()

# Berechnung der Anzahl der Straftaten pro 1000 Einwohner für 1995 und 1996
region_n_crimes['n_crimes_1995_per_1000'] = (region_n_crimes['n_crimes_1995'] / region_n_crimes['n_inhabitants']) * 1000
region_n_crimes['n_crimes_1996_per_1000'] = (region_n_crimes['n_crimes_1996'] / region_n_crimes['n_inhabitants']) * 1000

# Gruppiertes Balkendiagramm zur Darstellung der Straftaten pro 1000 Einwohner je Region
fig = px.bar(
    region_n_crimes,
    x='region',
    y=['n_crimes_1995_per_1000', 'n_crimes_1996_per_1000'],
    title='Number of Crimes per 1000 Inhabitants (1995 and 1996) by Region',
    labels={'region': 'Region', 'value': 'Crimes per 1000 Inhabitants'},
    barmode='group',
    text_auto='.2f'
)

fig.update_traces(textposition='outside', cliponaxis=False)
fig.show()

# Einlesen der Kundendaten mit spezifizierten Datentypen
# Hinweis: 'birth_number' wird als String eingelesen, um weitere Verarbeitung zu ermöglichen
client_df = pd.read_csv(
    "xselling_banking_data/client.csv",
    sep=";",
    dtype={
        "client_id": "int64",
        "district_id": "int64",
        "birth_number": "string",
    }
)

client_df.head()

#the number is in the form YYMMDD for men,
#the number is in the form YYMM+50DD for women,
#where YYMMDD is the date of birth

# Extrahieren von Jahr, Monat und Tag aus 'birth_number'
# Für Frauen wird zum Monat 50 addiert, daher Korrektur durch Abzug von 50, falls Monat > 12
year = client_df["birth_number"].str.slice(0, 2)
month_true = client_df["birth_number"].str.slice(2, 4).astype(int).apply(lambda x: x - 50 if x > 12 else x)
day = client_df["birth_number"].str.slice(4, 6)

# Geschlecht basierend auf Monatsteil bestimmen: Monat > 12 -> Frau (F), sonst Mann (M)
client_df["gender"] = client_df["birth_number"].str.slice(2, 4).astype(int).apply(lambda x: "F" if x > 12 else "M")

# Monat mit führender Null auffüllen und zusammenfügen
month = month_true.astype(str).str.zfill(2)
birth_fixed = year + month + day

# Ausgabe der originalen und korrigierten Geburtsnummern der ersten fünf Kunden
print(pd.DataFrame({
    "original": client_df["birth_number"].head(),
    "fixed": birth_fixed.head()
}))

  original   fixed
0   706213  701213
1   450204  450204
2   406009  401009
3   561201  561201
4   605703  600703

# Funktion zur Korrektur des Jahres (Jahr wird mit "19" ergänzt)
def correct_year(year):
    return "19" + year

# Korrigierte Geburtsnummer mit vollständigem Jahr (z.B. "19930115")
birth_fixed_corrected = birth_fixed.apply(lambda x: correct_year(x[:2]) + x[2:])

# Umwandlung in datetime-Format, fehlerhafte Werte werden als NaT gesetzt
client_df["birth_number"] = pd.to_datetime(birth_fixed_corrected, format="%Y%m%d", errors="coerce")

# Ausgabe der ersten Zeilen zur Kontrolle
print(client_df.head())

   client_id birth_number  district_id gender
0          1   1970-12-13           18      F
1          2   1945-02-04            1      M
2          3   1940-10-09            1      F
3          4   1956-12-01            5      M
4          5   1960-07-03            5      F

# Maximaldatum aus bereits geladenen DataFrames
max_accounts_date = accounts_df['date'].max()
max_card_date = credit_card_df['issued'].max()

# Nur Datumsspalte aus trans.csv einlesen
trans_date = pd.read_csv("xselling_banking_data/trans.csv", delimiter=";", usecols=["date"], parse_dates=["date"])
max_trans_date = trans_date["date"].max()

# Nur Datumsspalte aus loan.csv einlesen
loan_date = pd.read_csv("xselling_banking_data/loan.csv", delimiter=";", usecols=["date"], parse_dates=["date"])
max_loan_date = loan_date["date"].max()

# Gesamtmaximaldatum bestimmen
latest_date = max(max_accounts_date, max_card_date, max_trans_date, max_loan_date)

# Alter berechnen
client_df['birth_number'] = pd.to_datetime(client_df['birth_number'], errors='coerce')
client_df['age'] = (latest_date - client_df['birth_number']).dt.days // 365

C:\Users\grego\AppData\Local\Temp\ipykernel_23588\2441548511.py:6: UserWarning:

Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.

C:\Users\grego\AppData\Local\Temp\ipykernel_23588\2441548511.py:10: UserWarning:

Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.

client_df.describe(include='all')

# Gruppierung der Kunden nach Alter und Geschlecht mit Zählung der Personen pro Gruppe
year_gender_counts = client_df.groupby(['age', 'gender']).size().reset_index(name='count')

# Horizontales Balkendiagramm zur Verteilung des Alters nach Geschlecht
fig = px.bar(
    year_gender_counts, 
    x='count', 
    y='age', 
    color='gender', 
    orientation='h',
    title='Distribution of Age by Gender',
    labels={'age': 'Age', 'count': 'Count'},
    color_discrete_map={"M": "blue", "F": "red"},
    height=600
)

# Negative Werte für Männer (erste Gruppe) zum Plotten auf der linken Seite
fig.data[0].x = -fig.data[0].x

# Balkenkontur mit schwarzer Linie für bessere Abgrenzung
fig.update_traces(
    marker=dict(line=dict(width=2, color='black')) 
)

# Keine Textbeschriftung auf den Balken
fig.update_traces(textposition='none')

# Layout-Anpassungen für Achsentitel, Legende und Tick-Intervalle
fig.update_layout(
    xaxis_title='Count',
    yaxis_title='Age',
    showlegend=True, 
    legend_title='Gender',  
    xaxis=dict(tickmode='linear', dtick=10),
)

fig.show()

# Kundenanzahl pro Bezirk gruppieren und Prozentanteil berechnen
clients_per_district = client_df.groupby('district_id').size().reset_index(name='count')
clients_per_district['percentage'] = (clients_per_district['count'] / clients_per_district['count'].sum() * 100).round(1)

clients_per_district['district_id'] = clients_per_district['district_id'].astype(str)
clients_per_district_sorted = clients_per_district.sort_values(by='count', ascending=False)

fig = px.bar(
    clients_per_district_sorted, 
    x='district_id', 
    y='count',
    title='Anzahl der Kunden pro Bezirk',
    labels={'district_id': 'Bezirk', 'count': 'Anzahl der Kunden'},
    color='percentage'
)

fig.show()

# Daueraufträge einlesen mit spezifizierten Datentypen
order_df = pd.read_csv(
    "xselling_banking_data/order.csv",
    sep=";",
    dtype={
        "order_id": "int64",
        "account_id": "int64",
        "bank_to": "string",
        "account_to": "string",
        "amount": "float64",
        "k_symbol": "category"
    }
)

order_df.dtypes

order_id               int64
account_id             int64
bank_to       string[python]
account_to    string[python]
amount               float64
k_symbol            category
dtype: object

order_df.head()

order_df.isna().sum()

order_id      0
account_id    0
bank_to       0
account_to    0
amount        0
k_symbol      0
dtype: int64

order_df['order_id'].is_unique

True

order_df['k_symbol'] = order_df['k_symbol'].map({
    "POJISTNE": "insurance_payment",
    "SIPO": "household_payment",
    "LEASING": "leasing_payment",
    "UVER" : "loan_payment"
    
})

order_df.describe(include='all')

# Berechnung der Häufigkeiten und Prozentanteile der Kategorien in 'k_symbol'
type_counts_k_symbol = order_df['k_symbol'].value_counts().reset_index(name='count')
type_counts_k_symbol.columns = ['k_symbol', 'count']
type_counts_k_symbol = type_counts_k_symbol.sort_values('k_symbol')
type_counts_k_symbol['percent'] = (type_counts_k_symbol['count'] / type_counts_k_symbol['count'].sum() * 100).round(2)

# Balkendiagramm der Verteilung der 'k_symbol' Kategorien
fig = px.bar(
    type_counts_k_symbol,
    x='k_symbol',
    y='percent',
    text_auto=True,
    hover_data={"count": True},
    color='k_symbol'
)

fig.update_layout(
    xaxis_title='K Symbol',
    yaxis_title='Percent',
    xaxis=dict(type='category')
)

fig.update_traces(
    textfont_size=12,
    textangle=0,
    textposition="outside",
    cliponaxis=False
)

fig.show()

# Subplot mit Boxplot und Histogramm zur Verteilung der Auftragsbeträge erstellen
fig = make_subplots(rows=2, cols=1,
                    shared_xaxes=True,
                    vertical_spacing=0.02)

# Boxplot oben
fig.add_trace(go.Box(x=order_df["amount"], name="Boxplot", marker_color="blue", orientation='h'), row=1, col=1)

# Histogramm unten
fig.add_trace(go.Histogram(x=order_df["amount"], nbinsx=100, name="Histogramm", marker_color="blue"), row=2, col=1)

# Layout konfigurieren
fig.update_layout(
    height=600,
    title_text="Distribution of Order amount",
    showlegend=True,
    xaxis_title="Order Amount (CZK)",
    xaxis2_title="Order Amount (CZK)",
    bargap=0.05
)

fig.show()

# Einlesen der Darlehensdaten mit spezifizierten Datentypen und Datumsformat
loan_df = pd.read_csv(
    "xselling_banking_data/loan.csv",
    sep=";",
    dtype={
        "loan_id": "int64",
        "account_id": "int64",
        "amount": "float64",
        "duration": "int64",
        "payments": "float64",
        "status": "category"
    },
    parse_dates=["date"],
    date_parser=lambda x: pd.to_datetime(x, format="%y%m%d"))

loan_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 682 entries, 0 to 681
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   loan_id     682 non-null    int64         
 1   account_id  682 non-null    int64         
 2   date        682 non-null    datetime64[ns]
 3   amount      682 non-null    float64       
 4   duration    682 non-null    int64         
 5   payments    682 non-null    float64       
 6   status      682 non-null    category      
dtypes: category(1), datetime64[ns](1), float64(2), int64(3)
memory usage: 33.0 KB

loan_df.head()

loan_df.isna().sum()

loan_id       0
account_id    0
date          0
amount        0
duration      0
payments      0
status        0
dtype: int64

loan_df['loan_id'].is_unique

True

loan_df['status'] = loan_df['status'].map({
    "A" : "finished_ok",
    "B" : "finished_debts",
    "C" : "running_ok",
    "D" : "running_debts"
    
})

loan_df.describe(include='all')

# Subplot mit Boxplot und Histogramm zur Verteilung der Darlehensbeträge erstellen
fig = make_subplots(rows=2, cols=1,
                    shared_xaxes=True,
                    vertical_spacing=0.02)

# Boxplot oben
fig.add_trace(go.Box(x=loan_df["amount"], name="Boxplot", marker_color="red", orientation='h'), row=1, col=1)

# Histogramm unten
fig.add_trace(go.Histogram(x=loan_df["amount"], nbinsx=100, name="Histogramm", marker_color="red"), row=2, col=1)

# Layout konfigurieren
fig.update_layout(
    height=600,
    title_text="Distribution of Loan Amounts",
    showlegend=True,
    xaxis_title="Loan Amount (CZK)",
    xaxis2_title="Loan Amount (CZK)",
    bargap=0.05
)

fig.show()

# Häufigkeiten und Prozentanteile der Darlehenslaufzeiten berechnen und sortieren
type_counts_duration = loan_df['duration'].value_counts().reset_index(name='count')
type_counts_duration.columns = ['duration', 'count']
type_counts_duration = type_counts_duration.sort_values('duration')
type_counts_duration['percent'] = (type_counts_duration['count'] / type_counts_duration['count'].sum() * 100).round(2)
type_counts_duration['duration'] = type_counts_duration['duration'].astype(str)

# Balkendiagramm zur Verteilung der Darlehenslaufzeiten
fig = px.bar(
    type_counts_duration,
    x='duration',
    y='percent',
    text_auto=True,
    hover_data={"count": True},
    color='duration'
)

fig.update_layout(
    xaxis_title='Loan duration',
    yaxis_title='Percent %',
    xaxis=dict(type='category')
)

fig.update_traces(
    textfont_size=12,
    textangle=0,
    textposition="outside",
    cliponaxis=False
)

fig.show()

# Subplot mit Boxplot und Histogramm zur Verteilung der Darlehenszahlungen erstellen
fig = make_subplots(rows=2, cols=1,
                    shared_xaxes=True,
                    vertical_spacing=0.02)

# Boxplot oben
fig.add_trace(go.Box(x=loan_df["payments"], name="Boxplot", marker_color="blue", orientation='h'), row=1, col=1)

# Histogramm unten
fig.add_trace(go.Histogram(x=loan_df["amount"], nbinsx=100, name="Histogramm", marker_color="blue"), row=2, col=1)

# Layout konfigurieren
fig.update_layout(
    height=600,
    title_text="Distribution of Loan Payments",
    showlegend=True,
    xaxis_title="Loan Payments (CZK)",
    xaxis2_title="Loan Payments (CZK)",
    bargap=0.05
)

fig.show()

# Häufigkeiten und Prozentanteile der Darlehensstatus berechnen und sortieren
type_counts_status = loan_df['status'].value_counts().reset_index(name='count')
type_counts_status.columns = ['status', 'count']
type_counts_status = type_counts_status.sort_values('status')  # alphabetische Sortierung
type_counts_status['percent'] = (type_counts_status['count'] / type_counts_status['count'].sum() * 100).round(2)

# Balkendiagramm der Verteilung der Darlehensstatus
fig = px.bar(
    type_counts_status,
    x='status',
    y='percent',
    text_auto=True,
    hover_data={"count": True},
    color='status'
)

fig.update_layout(
    xaxis_title='Loan status',
    yaxis_title='Percent',
    xaxis=dict(type='category')
)

fig.update_traces(
    textfont_size=12,
    textangle=0,
    textposition="outside",
    cliponaxis=False
)

fig.show()

# Einlesen der Transaktionsdaten mit spezifizierten Datentypen und Datumsformat
transaction_df = pd.read_csv("xselling_banking_data/trans.csv", delimiter=";", dtype={
    "trans_id": "int64",
    "account_id": "int64",
    "type": "category",
    "operation": "category",
    "k_symbol": "category",
    "bank": "string",
    "account": "string"
}, parse_dates=["date"],
   date_format="%y%m%d")

transaction_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1056320 entries, 0 to 1056319
Data columns (total 10 columns):
 #   Column      Non-Null Count    Dtype         
---  ------      --------------    -----         
 0   trans_id    1056320 non-null  int64         
 1   account_id  1056320 non-null  int64         
 2   date        1056320 non-null  datetime64[ns]
 3   type        1056320 non-null  category      
 4   operation   873206 non-null   category      
 5   amount      1056320 non-null  float64       
 6   balance     1056320 non-null  float64       
 7   k_symbol    574439 non-null   category      
 8   bank        273508 non-null   string        
 9   account     295389 non-null   string        
dtypes: category(3), datetime64[ns](1), float64(2), int64(2), string(2)
memory usage: 59.4 MB

transaction_df.head()

transaction_df["type"] = transaction_df["type"].cat.rename_categories({
    "PRIJEM": "credit",
    "VYDAJ": "withdrawal"
})

transaction_df["operation"] = transaction_df["operation"].cat.rename_categories({
    "VYBER KARTOU": "credit_card_withdrawal",
    "VKLAD": "credit_in_cash",
    "PREVOD Z UCTU": "collection_from_another_bank",
    "VYBER": "withdrawal_in_cash",
    "PREVOD NA UCET": "remittance_to_another_bank"
})


transaction_df["k_symbol"] = transaction_df["k_symbol"].cat.rename_categories({
    "POJISTNE": "insurance_payment",
    "SLUZBY": "payment_for_statement",
    "UROK": "interest_credited",
    "SANKC. UROK": "sanction_interest_negative_balance",
    "SIPO": "household_payment",
    "DUCHOD": "oldage_pension",
    "UVER": "loan_payment"
})

for col in transaction_df.select_dtypes(include='category'):
    print(f"\nSpalte: {col}")
    print(f"Anzahl eindeutiger Werte: {transaction_df[col].nunique()}")
    print(transaction_df[col].cat.categories.tolist())

Spalte: type
Anzahl eindeutiger Werte: 3
['credit', 'VYBER', 'withdrawal']

Spalte: operation
Anzahl eindeutiger Werte: 5
['remittance_to_another_bank', 'collection_from_another_bank', 'credit_in_cash', 'withdrawal_in_cash', 'credit_card_withdrawal']

Spalte: k_symbol
Anzahl eindeutiger Werte: 8
[' ', 'oldage_pension', 'insurance_payment', 'sanction_interest_negative_balance', 'household_payment', 'payment_for_statement', 'interest_credited', 'loan_payment']

transaction_df.query("type == 'VYBER'")["operation"].value_counts()

operation
withdrawal_in_cash              16666
remittance_to_another_bank          0
collection_from_another_bank        0
credit_in_cash                      0
credit_card_withdrawal              0
Name: count, dtype: int64

transaction_df["type"] = transaction_df["type"].replace("VYBER" , "withdrawal")

for col in transaction_df.select_dtypes(include="category"):
    print(f"\nSpalte: {col}")
    print(f"Anzahl eindeutiger Werte: {transaction_df[col].nunique()}")
    print(f"Kategorien: {transaction_df[col].cat.categories.tolist()}")

Spalte: type
Anzahl eindeutiger Werte: 2
Kategorien: ['credit', 'withdrawal']

Spalte: operation
Anzahl eindeutiger Werte: 5
Kategorien: ['remittance_to_another_bank', 'collection_from_another_bank', 'credit_in_cash', 'withdrawal_in_cash', 'credit_card_withdrawal']

Spalte: k_symbol
Anzahl eindeutiger Werte: 8
Kategorien: [' ', 'oldage_pension', 'insurance_payment', 'sanction_interest_negative_balance', 'household_payment', 'payment_for_statement', 'interest_credited', 'loan_payment']

transaction_df.isna().mean().mul(100).round(1)

trans_id      0.00
account_id    0.00
date          0.00
type          0.00
operation    17.30
amount        0.00
balance       0.00
k_symbol     45.60
bank         74.10
account      72.00
dtype: float64

# 'unknown' als Kategorie in 'operation' hinzufügen, falls noch nicht vorhanden
if "unknown" not in transaction_df['operation'].cat.categories:
    transaction_df['operation'] = transaction_df['operation'].cat.add_categories("unknown")

# Leere Strings und NaN in 'operation' durch 'unknown' ersetzen
transaction_df['operation'] = transaction_df['operation'].replace(' ', 'unknown')
transaction_df['operation'] = transaction_df['operation'].fillna('unknown')

# 'unknown' als Kategorie in 'k_symbol' hinzufügen, falls noch nicht vorhanden
if "unknown" not in transaction_df['k_symbol'].cat.categories:
    transaction_df['k_symbol'] = transaction_df['k_symbol'].cat.add_categories("unknown")

# Leere Strings und NaN in 'k_symbol' durch 'unknown' ersetzen
transaction_df['k_symbol'] = transaction_df['k_symbol'].replace(' ', 'unknown')
transaction_df['k_symbol'] = transaction_df['k_symbol'].fillna('unknown')

# Entfernen der Spalten 'bank' und 'account', falls vorhanden
transaction_df.drop(columns=['bank', 'account'], inplace=True, errors='ignore')

transaction_df.isna().mean().mul(100).round(1)

trans_id     0.00
account_id   0.00
date         0.00
type         0.00
operation    0.00
amount       0.00
balance      0.00
k_symbol     0.00
dtype: float64

for col in transaction_df.select_dtypes(include='category'):
    print(f"\nSpalte: {col}")
    print(f"Anzahl eindeutiger Werte: {transaction_df[col].nunique()}")
    print(transaction_df[col].cat.categories.tolist())

Spalte: type
Anzahl eindeutiger Werte: 2
['credit', 'withdrawal']

Spalte: operation
Anzahl eindeutiger Werte: 6
['remittance_to_another_bank', 'collection_from_another_bank', 'credit_in_cash', 'withdrawal_in_cash', 'credit_card_withdrawal', 'unknown']

Spalte: k_symbol
Anzahl eindeutiger Werte: 8
['oldage_pension', 'insurance_payment', 'sanction_interest_negative_balance', 'household_payment', 'payment_for_statement', 'interest_credited', 'loan_payment', 'unknown']

transaction_df["amount"].describe().apply(lambda x: round(x, 2))

count   1056320.00
mean       5924.15
std        9522.74
min           0.00
25%         135.90
50%        2100.00
75%        6800.00
max       87400.00
Name: amount, dtype: float64

transaction_df.loc[transaction_df["type"] == "withdrawal", "amount"] *= -1

(transaction_df.loc[transaction_df["type"] == "withdrawal", "amount"] > 0).sum()

np.int64(0)

transaction_df['trans_id'].is_unique

True

transaction_df.describe(include='all')

# Aufteilung der Transaktionen in Einzahlungen (>0) und Abhebungen (<0)
deposits_df = transaction_df[transaction_df['amount'] > 0]
withdrawals_df = transaction_df[transaction_df['amount'] < 0]

# Subplot mit Boxplot und Histogramm für Einzahlungen
fig = make_subplots(rows=2, cols=1,
                    shared_xaxes=True,
                    row_heights=[0.2, 0.8],
                    vertical_spacing=0.02)

fig.add_trace(go.Box(x=deposits_df["amount"], name="Boxplot", marker_color="red", orientation='h'), row=1, col=1)
fig.add_trace(go.Histogram(x=deposits_df["amount"], nbinsx=100, name="Histogramm", marker_color="red"), row=2, col=1)

fig.update_layout(
    height=600,
    title_text="Deposits Distribution: Box Plot and Histogram",
    showlegend=True,
    xaxis_title="Transaction Amount (CZK)",
    bargap=0.05
)

fig.show()

# Subplot mit Boxplot und Histogramm für Abhebungen (Beträge absolut)
fig = make_subplots(rows=2, cols=1,
                    shared_xaxes=True,
                    row_heights=[0.2, 0.8],
                    vertical_spacing=0.02)

fig.add_trace(go.Box(x=withdrawals_df["amount"].abs(), name="Boxplot", marker_color="blue", orientation='h'), row=1, col=1)
fig.add_trace(go.Histogram(x=withdrawals_df["amount"].abs(), nbinsx=100, name="Histogramm", marker_color="blue"), row=2, col=1)

fig.update_layout(
    height=600,
    title_text="Withdrawals Distribution: Box Plot and Histogram (Absolute Values)",
    showlegend=True,
    bargap=0.05
)

fig.update_xaxes(title_text="Withdrawal Amount (CZK, abs.)", row=1, col=1)
fig.update_xaxes(title_text="Withdrawal Amount (CZK, abs.)", row=2, col=1)
fig.update_xaxes(range=[0, withdrawals_df["amount"].abs().max()], row=1, col=1)
fig.update_xaxes(range=[0, withdrawals_df["amount"].abs().max()], row=2, col=1)

fig.show()

# Prozentsatz der Häufigkeiten in der Spalte 'type' berechnen
type_counts = transaction_df['type'].value_counts(normalize=True).reset_index(name='percent')
type_counts['percent'] = (type_counts['percent'] * 100).round(2)
type_counts.columns = ['type', 'percent']

# Balkendiagramm für die Verteilung der Transaktionstypen
fig_type = px.bar(
    type_counts, 
    x='type', 
    y='percent', 
    title='Distribution of Transaction Types',
    labels={'type': 'Transaction Type', 'percent': 'Percentage (%)'},
    color='type',
    text_auto=True,
)
fig_type.update_traces(
    textfont_size=12,
    textangle=0,
    textposition="outside",
    cliponaxis=False
)
fig_type.show()

# Prozentsatz der Häufigkeiten in der Spalte 'operation' berechnen
operation_counts = transaction_df['operation'].value_counts(normalize=True).reset_index(name='percent')
operation_counts['percent'] = (operation_counts['percent'] * 100).round(2)
operation_counts.columns = ['operation', 'percent']

# Balkendiagramm für die Verteilung der Operationstypen
fig_operation = px.bar(
    operation_counts, 
    x='operation', 
    y='percent', 
    title='Distribution of Operations',
    labels={'operation': 'Operation Type', 'percent': 'Percentage (%)'},
    color='operation',
    text_auto=True,
)
fig_operation.update_traces(
    textfont_size=12,
    textangle=0,
    textposition="outside",
    cliponaxis=False
)
fig_operation.show()

# Subplot mit Boxplot und Histogramm zur Verteilung der Kontostände
fig = make_subplots(rows=2, cols=1,
                    shared_xaxes=True,
                    row_heights=[0.2, 0.8],
                    vertical_spacing=0.02)
fig.add_trace(go.Box(x=transaction_df["balance"], name="Boxplot", marker_color="blue", orientation='h'), row=1, col=1)
fig.add_trace(go.Histogram(x=transaction_df["balance"], nbinsx=100, name="Histogramm", marker_color="blue"), row=2, col=1)
fig.update_layout(
    height=600,
    title_text="Balance Distribution: Box Plot and Histogram",
    showlegend=True,
    bargap=0.05
)
fig.show()

# Prozentsatz der Häufigkeiten in der Spalte 'k_symbol' berechnen
type_counts = transaction_df['k_symbol'].value_counts(normalize=True).reset_index(name='percent')
type_counts['percent'] = (type_counts['percent'] * 100).round(2)
type_counts.columns = ['k_symbol', 'percent']

# Balkendiagramm für die Verteilung der 'k_symbol'-Kategorien
fig_type = px.bar(
    type_counts, 
    x='k_symbol', 
    y='percent', 
    title='Distribution of K_Symbol: Characterization of Transaction',
    labels={'k_symbol': 'Characterization of Transaction', 'percent': 'Percentage (%)'},
    color='k_symbol',
    text_auto=True,
)

fig_type.update_traces(
    textfont_size=12,
    textangle=0,
    textposition="outside",
    cliponaxis=False
)

fig_type.show()

# 'date' als datetime konvertieren
transaction_df['date'] = pd.to_datetime(transaction_df['date'])

# Jahr-Monat als Periodenstring extrahieren
transaction_df['year_month'] = transaction_df['date'].dt.to_period('M').astype(str)

# Monatliche Summe der Transaktionsbeträge berechnen
monthly_sum = transaction_df.groupby('year_month')['amount'].sum().reset_index()
monthly_sum['year_month'] = pd.to_datetime(monthly_sum['year_month'])

# Linienplot der monatlichen Transaktionssummen
fig = px.line(monthly_sum,
              x='year_month',
              y='amount',
              labels={'year_month': 'Monat', 'amount': 'Transaktionssumme (CZK)'},
              title="Monatliche Transaktionssumme (1993–1998)")

# Achsen-Formatierung für bessere Lesbarkeit
fig.update_layout(
    xaxis=dict(
        dtick="M3",             # Tick alle 3 Monate
        tickformat="%b %Y",     # Format: Jan 1993
        tickangle=45            # Beschriftung schräg
    ),
    title_text="Monatliche Transaktionssumme (1993–1998)",
    yaxis_title="Betrag (CZK)",
    xaxis_title="Monat",
    height=500
)

fig.show()

# Maske für die Konten 14 und 18 erstellen
mask = transaction_df["account_id"].isin([14, 18])

# Gefilterte Kopie mit neuer Spalte 'month' als Jahr-Monat-Periode
tx_sel = (
    transaction_df.loc[mask]
                  .copy()  # Eigene Kopie, um SettingWithCopyWarning zu vermeiden
                  .assign(month=lambda df: df["date"].dt.to_period("M"))
)

# Umsatz (saldiert)
umsatz = (
    tx_sel.groupby(["account_id", "month"])["amount"]
          .sum()
          .rename("umsatz")
)

# Monatsend-Saldo
saldo = (
    tx_sel.sort_values("date")
         .groupby(["account_id", "month"])["balance"]
         .last()
         .rename("saldo")
)

# Zusammenführen der Salden und Umsätze
monthly = pd.concat([umsatz, saldo], axis=1).reset_index()

# Umsätze getrennt positiv und negativ berechnen
umsatz_pos = (
    tx_sel[tx_sel["amount"] > 0]
    .groupby(["account_id", "month"])["amount"]
    .sum()
    .rename("umsatz_pos")
)

umsatz_neg = (
    tx_sel[tx_sel["amount"] < 0]
    .groupby(["account_id", "month"])["amount"]
    .sum()
    .abs()
    .rename("umsatz_neg")
)

# Alle in monthly einfügen
monthly = monthly.set_index(["account_id", "month"])
monthly = monthly.join([umsatz_pos, umsatz_neg]).fillna(0).reset_index()

# 1) Plot Monatsend-Saldo
plt.figure(figsize=(12,5))
for acc_id, grp in monthly.groupby("account_id"):
    plt.plot(grp["month"].dt.to_timestamp(), grp["saldo"], label=f"Saldo Konto {acc_id}")
plt.title("Monatsend-Saldo")
plt.xlabel("Monat")
plt.ylabel("Saldo (CZK)")
plt.xticks(rotation=45)
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()

# 2) Plot nur Einzahlungen
plt.figure(figsize=(12,5))
for acc_id, grp in monthly.groupby("account_id"):
    plt.plot(grp["month"].dt.to_timestamp(), grp["umsatz_pos"], label=f"Einzahlungen Konto {acc_id}")
plt.title("Monatliche Einzahlungen")
plt.xlabel("Monat")
plt.ylabel("Betrag (CZK)")
plt.xticks(rotation=45)
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()

# 3) Plot nur Auszahlungen
plt.figure(figsize=(12,5))
for acc_id, grp in monthly.groupby("account_id"):
    plt.plot(grp["month"].dt.to_timestamp(), grp["umsatz_neg"], label=f"Auszahlungen Konto {acc_id}")
plt.title("Monatliche Auszahlungen")
plt.xlabel("Monat")
plt.ylabel("Betrag (CZK)")
plt.xticks(rotation=45)
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()

example_year = 1997
# Filtere Transaktionen für das Beispieljahr
tx_1997 = tx_sel[tx_sel["date"].dt.year == example_year]

# Formatter zur Anzeige nur des Monats als Zahl auf der x-Achse
def month_formatter(x, pos=None):
    dt = mdates.num2date(x)
    return str(dt.month)

for acc_id in [14, 18]:
    data = tx_1997[tx_1997["account_id"] == acc_id]
    
    # Tagesweise Summen berechnen
    daily_sum = data.groupby("date")["amount"].sum()
    daily_pos = data[data["amount"] > 0].groupby("date")["amount"].sum()
    daily_neg = data[data["amount"] < 0].groupby("date")["amount"].sum().abs()
    daily_balance = data.groupby("date")["balance"].last()  # Tagesend-Vermögen
    
    for vals, label, color in [
        (daily_sum, "Gesamtumsatz", None),
        (daily_pos, "Einzahlungen", "green"),
        (daily_neg, "Auszahlungen", "red"),
        (daily_balance, "Tägliches Vermögen", "blue"),
    ]:
        plt.figure(figsize=(14,4))
        plt.plot(vals.index, vals.values, label=label, color=color)
        plt.title(f"{label} Konto {acc_id} im Jahr {example_year}")
        plt.xlabel("Monat")
        plt.ylabel("Betrag (CZK)")
        plt.grid(True)
        plt.legend()
        
        ax = plt.gca()
        ax.xaxis.set_major_locator(mdates.MonthLocator())
        ax.xaxis.set_major_formatter(FuncFormatter(month_formatter))
        
        plt.xticks(rotation=0)
        plt.tight_layout()
        plt.show()

# Ausgangstabelle: client_df
merged_df_static = client_df

# Verbindung mit der disposition_df-Tabelle per 'client_id'
merged_df_static = pd.merge(merged_df_static, disposition_df, on='client_id', how='left')

# Verbindung mit der credit_card_df-Tabelle per 'disp_id'
merged_df_static = pd.merge(merged_df_static, credit_card_df, on='disp_id', how='left')

# Verbindung mit der account_df-Tabelle per 'account_id'
merged_df_static = pd.merge(merged_df_static, accounts_df, on='account_id', how='left')

merged_df_static.head(10)

merged_df_static.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5369 entries, 0 to 5368
Data columns (total 14 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   client_id      5369 non-null   int64         
 1   birth_number   5369 non-null   datetime64[ns]
 2   district_id_x  5369 non-null   int64         
 3   gender         5369 non-null   object        
 4   age            5369 non-null   int64         
 5   disp_id        5369 non-null   int64         
 6   account_id     5369 non-null   int64         
 7   type_x         5369 non-null   category      
 8   card_id        892 non-null    float64       
 9   type_y         892 non-null    category      
 10  issued         892 non-null    datetime64[ns]
 11  district_id_y  5369 non-null   int64         
 12  frequency      5369 non-null   category      
 13  date           5369 non-null   datetime64[ns]
dtypes: category(3), datetime64[ns](3), float64(1), int64(6), object(1)
memory usage: 477.6+ KB

merged_df_static.nunique()

client_id        5369
birth_number     4738
district_id_x      77
gender              2
age                77
disp_id          5369
account_id       4500
type_x              2
card_id           892
type_y              3
issued            607
district_id_y      77
frequency           3
date             1535
dtype: int64

merged_df_static

merged_df_static.rename(columns={'district_id_x': 'district_id_client', 'district_id_y': 'disctrict_id_account', 'type_x': 'disposition_type', 'type_y': 'credit_card_type'}, inplace=True)

merged_df_static.head()

# Verbindung mit district_df basierend auf client_df
# Merge basierend auf der 'client_district_id'
merged_df_static = pd.merge(
    merged_df_static, 
    district_df, 
    left_on='district_id_client', 
    right_on='district_id', 
    how='left', 
    suffixes=('', '_client')
)

# Lösche die redundante district_id Spalte
merged_df_static.drop('district_id', axis=1, inplace=True)

merged_df_static.head()

merged_df_static.columns

Index(['client_id', 'birth_number', 'district_id_client', 'gender', 'age',
       'disp_id', 'account_id', 'disposition_type', 'card_id',
       'credit_card_type', 'issued', 'disctrict_id_account', 'frequency',
       'date', 'district_name', 'region', 'n_inhabitants',
       'n_municipals_lower_499', 'n_municipals_between_500_1999',
       'n_municipals_between_2000_9999', 'n_municipals_higher_10000',
       'n_cities', 'ratio_urban_inhabitants', 'avg_salary',
       'unemployment_rate_1995', 'unemployment_rate_1996',
       'n_enterpreneurs_per_1k_inhabitants', 'n_crimes_1995', 'n_crimes_1996'],
      dtype='object')

cc_owner_df = merged_df_static[merged_df_static['card_id'].notna()]

non_cc_owner_df = merged_df_static[merged_df_static['card_id'].isna()]

cc_owner_df.head(10)

non_cc_owner_df.head(10)

# Zusammenfassung der Anzahl von Kreditkarteninhabern und Nichtinhabern mit Prozentanteilen
cc_summary_df = pd.DataFrame({
    "Group": ["Kreditkarten-Inhaber", "Kreditkarten-Nichtinhaber"],
    "Count": [len(cc_owner_df), len(non_cc_owner_df)],
})
cc_summary_df["percent"] = ((cc_summary_df["Count"] / cc_summary_df["Count"].sum()) * 100).round(2)

# Balkendiagramm zur Verteilung der beiden Kundengruppen
fig = px.bar(
    cc_summary_df,
    x='Group',
    y='percent', 
    text='percent',  
    title='Verteilung der Kreditkartenkäufer und Nicht-Käufer',
    labels={'Group': 'Kundengruppe', 'percent': 'Prozent'},
    color='Group',  
    text_auto=True
)

fig.update_traces(
    textfont_size=12,
    textangle=0,
    textposition="outside", 
    cliponaxis=False
)

fig.show()

# Filteren OWNER Einträge
owner_df = merged_df_static[merged_df_static['disposition_type'] == 'OWNER']
num_owners = len(owner_df)
print(f'Anzahl Typ OWNER: {num_owners}')

# Filtern OWNER mit Kreditkarte
owner_with_card_df = owner_df[owner_df['card_id'].notna()]
num_owner_with_cc=len(owner_with_card_df)
print(f'Anzahl Typ OWNER mit Kreditkarte: {num_owner_with_cc}')

print(f"Besitzen alle 'owner' eine Kreditkarte? {num_owners == num_owner_with_cc}")

Anzahl Typ OWNER: 4500
Anzahl Typ OWNER mit Kreditkarte: 892
Besitzen alle 'owner' eine Kreditkarte? False

# Anzahl der OWNER ohne Kreditkarte
owner_without_card_df = owner_df[owner_df['card_id'].isna()]
num_owner_without_cc = len(owner_without_card_df)

owner_card_counts = pd.DataFrame({
    'category': ['With Credit Card', 'Without Credit Card'],
    'count': [num_owner_with_cc, num_owner_without_cc]
})

owner_card_counts['percent'] = (owner_card_counts['count'] / owner_card_counts['count'].sum() * 100).round(1)

fig = px.bar(
    owner_card_counts, 
    x='category', 
    y='percent', 
    text_auto='.1f', 
    color='category', 
    hover_data=['count'],
    title='Distribution of OWNER with and without Credit Card',
    labels={'category': 'Category', 'percent': 'Percentage (%)'}
)

fig.update_traces(textposition='outside', cliponaxis=False)
fig.show()

# Filteren DISPONENT Einträge
disponent_df = merged_df_static[merged_df_static['disposition_type'] == 'DISPONENT']
num_disponents = len(disponent_df)
print(f'Anzahl Typ DISPONENT: {num_disponents}')

# Filtern DISPONENT mit Kreditkarte
disponent_with_card_df = disponent_df[disponent_df['card_id'].notna()]
num_disponent_with_cc = len(disponent_with_card_df)
print(f'Anzahl Typ DISPONENT mit Kreditkarte: {num_disponent_with_cc}')

print(f"Besitzen DISPONENT Einträge eine Kreditkarte? {num_disponents == num_disponent_with_cc}")

Anzahl Typ DISPONENT: 869
Anzahl Typ DISPONENT mit Kreditkarte: 0
Besitzen DISPONENT Einträge eine Kreditkarte? False

# Filtern nach Junior-Karten
junior_cards_df = merged_df_static[merged_df_static['credit_card_type'] == 'junior']

fig = px.histogram(
    junior_cards_df, 
    x='age', 
    title='Age Distribution of Junior Credit Cards',
    labels={'age': 'Age'},
    nbins=20  # Anzahl der Bins anpassen, wenn nötig
)

fig.show()

print(f"Maximalalter Besitzer junior-Card: {max(junior_cards_df['age'])}")

Maximalalter Besitzer junior-Card: 24

# Vor dem Mergen
unique_clients_before = len(disposition_df['client_id'].unique())
print(f"Einzigartige client_id im disposition_df vor dem Merge: {unique_clients_before}")

# Nach dem Meergen
unique_clients_after = len(merged_df_static['client_id'].unique())
print(f"Einzigartige client_id nach dem Merge: {unique_clients_after}")

Einzigartige client_id im disposition_df vor dem Merge: 5369
Einzigartige client_id nach dem Merge: 5369

# Anzahl der eindeutigen disposition_types pro client_id
multi_roles_per_client = merged_df_static.groupby('client_id')['disposition_type'].nunique()

# Clients mit mehr als einer Rolle
clients_with_multiple_roles = multi_roles_per_client[multi_roles_per_client > 1]

# Anzahl der Clients mit einer Rolle
clients_with_single_role = multi_roles_per_client[multi_roles_per_client == 1]

print(f"Anzahl der Clients mit mehreren Rollen: {len(clients_with_multiple_roles)}")
print(f"Anzahl der Clients mit einer Rolle: {len(clients_with_single_role)}")

# Zusammenfassung der Ergebnisse
role_counts_client = pd.Series(
    {
        'Single Role': len(clients_with_single_role),
        'Multiple Roles': len(clients_with_multiple_roles)
    }
).reset_index()

role_counts_client.columns = ['Role Type', 'Count']
role_counts_client['Percent'] = (role_counts_client['Count'] / role_counts_client['Count'].sum() * 100).round(1)

# Visualisierung
fig = px.bar(
    role_counts_client, 
    x='Role Type', 
    y='Percent', 
    text='Percent',  
    title='Distribution of Clients with Single and Multiple Roles',
    labels={'Role Type': 'Role Type', 'Percent': 'Percentage (%)'},
    color='Role Type',
    color_discrete_map={'Single Role': 'blue', 'Multiple Roles': 'red'},
    hover_data={'Role Type': True, 'Count': True}
)

fig.update_traces(textposition='outside', cliponaxis=False)
fig.show()

Anzahl der Clients mit mehreren Rollen: 0
Anzahl der Clients mit einer Rolle: 5369

# Berechnung der Kreditkarten als Prozent der Einwohner
cards_per_district = (
    merged_df_static.groupby('district_id_client')
    .agg(num_cards=('card_id', 'count'), n_inhabitants=('n_inhabitants', 'first'))
    .reset_index()
)


# Berechnung des Anteils der Kreditkarten an der Einwohnerzahl in Prozent
cards_per_district['cards_per_inhabitants_percent'] = (
    cards_per_district['num_cards'] / cards_per_district['n_inhabitants'].replace(0, np.nan)
) * 100

# Visualisierung - als Prozent
fig = px.bar(
    cards_per_district, 
    x='district_id_client', 
    y='cards_per_inhabitants_percent', 
    title='Percentage of Credit Cards relative to Inhabitants by District',
    labels={
        'district_id_client': 'District ID',
        'cards_per_inhabitants_percent': 'Percentage of Credit Cards (%)'
    },
    color='cards_per_inhabitants_percent', 
    color_continuous_scale='Viridis'
)

fig.update_layout(
    xaxis_title='District ID',
    yaxis_title='Percentage of Credit Cards relative to Inhabitants (%)',
    showlegend=False
)

fig.show()

# Temporäre Kopie mit Spalten 'frequency' und 'card_id'
temp_df = merged_df_static[['frequency', 'card_id']].copy()

# Neue Spalte: 1 wenn Karte vorhanden, sonst 0
temp_df['has_card'] = temp_df['card_id'].notna().astype(int)

# Gruppierung nach 'frequency' und 'has_card' mit Zählung
frequency_card_counts = temp_df.groupby(['frequency', 'has_card']).size().reset_index(name='count')

# Prozentuale Anteile innerhalb jeder Frequenzgruppe berechnen
frequency_card_counts['percent'] = (
    frequency_card_counts['count'] / frequency_card_counts.groupby('frequency')['count'].transform('sum') * 100
).round(1)

# Lesbare Beschriftung der Kreditkartenbesitz-Status
frequency_card_counts['card_status'] = frequency_card_counts['has_card'].map({1: 'Yes', 0: 'No'})

# Gruppiertes Balkendiagramm nach Frequenz und Kreditkartenbesitz
fig = px.bar(
    frequency_card_counts, 
    x='frequency', 
    y='percent', 
    color='card_status',
    title='Credit Card Ownership by Frequency Group',
    labels={'frequency': 'Frequency', 'percent': 'Percentage (%)', 'card_status': 'Has Credit Card'},
    barmode='group', 
    color_discrete_map={'Yes': 'green', 'No': 'red'}
)

fig.show()

merged_df_static = merged_df_static[merged_df_static['age'] >= 25]

merged_df_static.head(10)

all_owners = (merged_df_static['disposition_type'] == 'OWNER').all()
print(f"Alle Zeilen sind OWNER: {all_owners}")

Alle Zeilen sind OWNER: False

# Nur Zeilen mit 'disposition_type' == 'OWNER' behalten
merged_df_static = merged_df_static[merged_df_static['disposition_type'] == 'OWNER']

# Kategorien auf tatsächlich vorhandene Werte einschränken
merged_df_static['disposition_type'] = merged_df_static['disposition_type'].cat.remove_unused_categories()

print(merged_df_static['disposition_type'].unique())

['OWNER']
Categories (1, object): ['OWNER']

merged_df_static.head(10)

# Zielvariable erstellen (1 für Kreditkartenbesitzer, 0 für Nicht-Käufer)
merged_df_static['has_cc'] = merged_df_static['card_id'].notna().astype(int)

# Überprüfen der neuen Zielvariablen
merged_df_static.head(10)

buyers_df = merged_df_static[merged_df_static['has_cc'] == 1]

buyers_df['cc_purchase_date'] = buyers_df['issued']
buyers_df.drop(columns=['issued'], inplace=True, errors='ignore')

C:\Users\grego\AppData\Local\Temp\ipykernel_23588\2093553395.py:1: SettingWithCopyWarning:


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

C:\Users\grego\AppData\Local\Temp\ipykernel_23588\2093553395.py:2: SettingWithCopyWarning:


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

buyers_df.head()

buyers_df.columns

Index(['client_id', 'birth_number', 'district_id_client', 'gender', 'age',
       'disp_id', 'account_id', 'disposition_type', 'card_id',
       'credit_card_type', 'disctrict_id_account', 'frequency', 'date',
       'district_name', 'region', 'n_inhabitants', 'n_municipals_lower_499',
       'n_municipals_between_500_1999', 'n_municipals_between_2000_9999',
       'n_municipals_higher_10000', 'n_cities', 'ratio_urban_inhabitants',
       'avg_salary', 'unemployment_rate_1995', 'unemployment_rate_1996',
       'n_enterpreneurs_per_1k_inhabitants', 'n_crimes_1995', 'n_crimes_1996',
       'has_cc', 'cc_purchase_date'],
      dtype='object')

merged_df_dynamic = accounts_df

merged_df_dynamic = pd.merge(merged_df_dynamic, order_df, on='account_id', how='left')
merged_df_dynamic = pd.merge(merged_df_dynamic, transaction_df, on='account_id', how='left')
merged_df_dynamic = pd.merge(merged_df_dynamic, loan_df, on='account_id', how='left')

merged_df_dynamic.head()

merged_df_dynamic.columns

Index(['account_id', 'district_id', 'frequency', 'date_x', 'order_id',
       'bank_to', 'account_to', 'amount_x', 'k_symbol_x', 'trans_id', 'date_y',
       'type', 'operation', 'amount_y', 'balance', 'k_symbol_y', 'year_month',
       'loan_id', 'date', 'amount', 'duration', 'payments', 'status'],
      dtype='object')

merged_df_dynamic.rename(columns={
    'date_x': 'account_date', 
    'date_y': 'trans_date', 
    'date': 'loan_date', 
    'amount_x': 'order_amount', 
    'amount_y': 'trans_amount',
    'amount': 'loan_amount',
    'k_symbol_x': 'k_symbol_order',
    'k_symbol_y': 'k_symbol_trans'
    }, inplace=True)

merged_df_dynamic.head()

merged_df_dynamic.describe(include='all')

# Füge 'client_id' aus merged_df_static hinzu, basierend auf 'account_id'
merged_df_dynamic = merged_df_dynamic.merge(merged_df_static[['account_id', 'client_id']], 
                                            on='account_id', 
                                            how='left')

# Überprüfe, ob 'client_id' jetzt in merged_df_dynamic vorhanden ist
print(merged_df_dynamic.columns)

Index(['account_id', 'district_id', 'frequency', 'account_date', 'order_id',
       'bank_to', 'account_to', 'order_amount', 'k_symbol_order', 'trans_id',
       'trans_date', 'type', 'operation', 'trans_amount', 'balance',
       'k_symbol_trans', 'year_month', 'loan_id', 'loan_date', 'loan_amount',
       'duration', 'payments', 'status', 'client_id'],
      dtype='object')

merged_df_dynamic.head()

# Für alle Spalten aussser 'account_id' prüfen, ob mehr als ein unique Wert pro account_id existiert
unique_counts = merged_df_dynamic.groupby('account_id').nunique()

# Für jede Spalte prüfen, ob es mindestens einen account_id gibt mit mehr als einem unique Wert
for col in unique_counts.columns:
    if (unique_counts[col] > 1).any():
        print(f"Es gibt Accounts mit mehr als einem einzigartigen Wert in der Spalte '{col}'.")
    else:
        print(f"Alle Accounts haben nur einen einzigartigen Wert in der Spalte '{col}'.")

Alle Accounts haben nur einen einzigartigen Wert in der Spalte 'district_id'.
Alle Accounts haben nur einen einzigartigen Wert in der Spalte 'frequency'.
Alle Accounts haben nur einen einzigartigen Wert in der Spalte 'account_date'.
Es gibt Accounts mit mehr als einem einzigartigen Wert in der Spalte 'order_id'.
Es gibt Accounts mit mehr als einem einzigartigen Wert in der Spalte 'bank_to'.
Es gibt Accounts mit mehr als einem einzigartigen Wert in der Spalte 'account_to'.
Es gibt Accounts mit mehr als einem einzigartigen Wert in der Spalte 'order_amount'.
Es gibt Accounts mit mehr als einem einzigartigen Wert in der Spalte 'k_symbol_order'.
Es gibt Accounts mit mehr als einem einzigartigen Wert in der Spalte 'trans_id'.
Es gibt Accounts mit mehr als einem einzigartigen Wert in der Spalte 'trans_date'.
Es gibt Accounts mit mehr als einem einzigartigen Wert in der Spalte 'type'.
Es gibt Accounts mit mehr als einem einzigartigen Wert in der Spalte 'operation'.
Es gibt Accounts mit mehr als einem einzigartigen Wert in der Spalte 'trans_amount'.
Es gibt Accounts mit mehr als einem einzigartigen Wert in der Spalte 'balance'.
Es gibt Accounts mit mehr als einem einzigartigen Wert in der Spalte 'k_symbol_trans'.
Es gibt Accounts mit mehr als einem einzigartigen Wert in der Spalte 'year_month'.
Alle Accounts haben nur einen einzigartigen Wert in der Spalte 'loan_id'.
Alle Accounts haben nur einen einzigartigen Wert in der Spalte 'loan_date'.
Alle Accounts haben nur einen einzigartigen Wert in der Spalte 'loan_amount'.
Alle Accounts haben nur einen einzigartigen Wert in der Spalte 'duration'.
Alle Accounts haben nur einen einzigartigen Wert in der Spalte 'payments'.
Alle Accounts haben nur einen einzigartigen Wert in der Spalte 'status'.
Alle Accounts haben nur einen einzigartigen Wert in der Spalte 'client_id'.

def get_rollup_window_data(
        df: pd.DataFrame,
        buy_dates: pd.DataFrame,
        rollup_window: int,
        date_column: str = "cc_purchase_date") -> pd.DataFrame:
    """
    Aggregiert Kennzahlen je (client_id, Kaufdatum) innerhalb eines Rollup-Fensters.

    Enthalten sind:
    - Transaktions-, Kredit- und Dauerauftrags-Kennzahlen
    - KEINE Dummy-Variablen für Kategorien, sondern je Gruppe eine String-Spalte mit dominierender Klasse
      (transaction_type, transaction_operation, trans_k_symbol, order_k_symbol)
    - Höchstens ein Kredit pro Kunde/Fenster
    - Rückgabe ohne NaN: numerische Werte mit 0, Strings mit 'missing' (oder leer zu 'missing')
    """

    # Alias-Spaltennamen
    TRANS_DATE, LOAN_DATE = "trans_date", "loan_date"
    TRANS_AMT,  LOAN_AMT  = "trans_amount", "loan_amount"
    ORDER_AMT             = "order_amount"
    BALANCE               = "balance"

    rows = []

    for _, br in buy_dates.iterrows():
        cid, buy_dt = br["client_id"], br[date_column]
        start_dt = buy_dt - pd.Timedelta(days=rollup_window)
        sub = df[df["client_id"] == cid]

        # Transaktionen im Fenster filtern
        t = sub[
            sub[TRANS_DATE].notna() &
            sub[TRANS_DATE].between(start_dt, buy_dt, inclusive="left")
        ]

        # Transaktionskennzahlen berechnen
        t_stats = {
            "total_spent"      : t[TRANS_AMT].sum(),
            "num_transactions" : len(t),
            "avg_balance"      : t[BALANCE].mean(),
            "max_balance"      : t[BALANCE].max(),
            "min_balance"      : t[BALANCE].min(),
            "std_balance"      : t[BALANCE].std(ddof=0),
            "avg_trans_amount" : t[TRANS_AMT].mean(),
            "med_trans_amount" : t[TRANS_AMT].median(),
            "max_trans_amount" : t[TRANS_AMT].max(),
            "min_trans_amount" : t[TRANS_AMT].min(),
            "std_trans_amount" : t[TRANS_AMT].std(ddof=0),
            "balance_before_cc": (
                t.sort_values(TRANS_DATE).iloc[-1][BALANCE] if not t.empty else 0
            ),
            # Modus als dominante Kategorie (leere als "")
            "transaction_type"      : t["type"].mode().iat[0] if not t["type"].dropna().empty else "",
            "transaction_operation" : t["operation"].mode().iat[0] if not t["operation"].dropna().empty else "",
            "trans_k_symbol"        : t["k_symbol_trans"].mode().iat[0] if not t["k_symbol_trans"].dropna().empty else "",
        }

        # Kredite (max 1) filtern und Kennzahlen setzen
        loans = sub[
            sub[LOAN_DATE].notna() &
            sub[LOAN_DATE].between(start_dt, buy_dt, inclusive="left")
        ]
        if "loan_id" in loans.columns:
            loans = loans.drop_duplicates("loan_id")

        if loans.empty:
            l_stats = dict(num_loans=0, loan_amount=0, loan_duration=0, loan_payments=0, loan_status="")
        else:
            lo = loans.iloc[0]
            l_stats = dict(
                num_loans=1,
                loan_amount=lo[LOAN_AMT],
                loan_duration=lo["duration"],
                loan_payments=lo["payments"],
                loan_status=str(lo["status"])
            )

        # Daueraufträge filtern und Kennzahlen setzen
        orders = sub[sub[ORDER_AMT].notna()]
        if "order_id" in orders.columns:
            orders = orders.drop_duplicates("order_id")

        o_stats = {
            "num_perm_orders": len(orders),
            "total_order_amount": orders[ORDER_AMT].sum(),
            "avg_order_amount": orders[ORDER_AMT].mean(),
            "order_k_symbol": (
                orders["k_symbol_order"].mode().iat[0] if not orders["k_symbol_order"].dropna().empty else ""
            )
        }

        # Ergebniszeile sammeln
        rows.append({"client_id": cid, **t_stats, **l_stats, **o_stats})

    # DataFrame erstellen und NaN-Werte ersetzen
    rollup_df = pd.DataFrame(rows)

    num_cols = rollup_df.select_dtypes(include="number").columns
    obj_cols = rollup_df.select_dtypes(include="object").columns

    rollup_df[num_cols] = rollup_df[num_cols].fillna(0)
    rollup_df[obj_cols] = rollup_df[obj_cols].fillna("missing")
    rollup_df[obj_cols] = rollup_df[obj_cols].replace('', 'missing')

    return rollup_df

buyers_df = buyers_df.reset_index(drop=True)

# Rollup-Daten für Kreditkartenkäufer im definierten Fenster erzeugen
buyers_event_info_df = get_rollup_window_data(
    df=merged_df_dynamic,
    buy_dates=buyers_df[['client_id', 'cc_purchase_date']],
    rollup_window=395,
    date_column='cc_purchase_date'
)

buyers_event_info_df.head(50)

buyers_event_info_df.columns

Index(['client_id', 'total_spent', 'num_transactions', 'avg_balance',
       'max_balance', 'min_balance', 'std_balance', 'avg_trans_amount',
       'med_trans_amount', 'max_trans_amount', 'min_trans_amount',
       'std_trans_amount', 'balance_before_cc', 'transaction_type',
       'transaction_operation', 'trans_k_symbol', 'num_loans', 'loan_amount',
       'loan_duration', 'loan_payments', 'loan_status', 'num_perm_orders',
       'total_order_amount', 'avg_order_amount', 'order_k_symbol'],
      dtype='object')

buyers_event_info_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 691 entries, 0 to 690
Data columns (total 25 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   client_id              691 non-null    int64  
 1   total_spent            691 non-null    float64
 2   num_transactions       691 non-null    int64  
 3   avg_balance            691 non-null    float64
 4   max_balance            691 non-null    float64
 5   min_balance            691 non-null    float64
 6   std_balance            691 non-null    float64
 7   avg_trans_amount       691 non-null    float64
 8   med_trans_amount       691 non-null    float64
 9   max_trans_amount       691 non-null    float64
 10  min_trans_amount       691 non-null    float64
 11  std_trans_amount       691 non-null    float64
 12  balance_before_cc      691 non-null    float64
 13  transaction_type       691 non-null    object 
 14  transaction_operation  691 non-null    object 
 15  trans_k_symbol         691 non-null    object 
 16  num_loans              691 non-null    int64  
 17  loan_amount            691 non-null    float64
 18  loan_duration          691 non-null    float64
 19  loan_payments          691 non-null    float64
 20  loan_status            691 non-null    object 
 21  num_perm_orders        691 non-null    int64  
 22  total_order_amount     691 non-null    float64
 23  avg_order_amount       691 non-null    float64
 24  order_k_symbol         691 non-null    object 
dtypes: float64(16), int64(4), object(5)
memory usage: 135.1+ KB

non_buyers_df = merged_df_static[merged_df_static['has_cc'] == 0]

np.random.seed(42)

# Schritt 1: Liste der echten Kaufdaten
purchchase_dates = buyers_df['cc_purchase_date'].dropna()

purchchase_dates.head()

# Schritt 2 : Zufälliges Sample für Nicht-Käufer ziehen
non_buyers_df = non_buyers_df.copy()
non_buyers_df['pseudo_purchase_date'] = np.random.choice(purchchase_dates, size=len(non_buyers_df), replace=True)

# Schritt 2: Zufälliges Sample für Nicht-Käufer ziehen
non_buyers_df = non_buyers_df.copy()
non_buyers_df['pseudo_purchase_date'] = np.random.choice(purchchase_dates, size=len(non_buyers_df), replace=True)

non_buyers_df.head()

# Vergleich der Verteilung der tatsächlichen Kaufdaten von Käufern mit Pseudo-Kaufdaten von Nicht-Käufern

fig = make_subplots(rows=1, cols=2, subplot_titles=("Buyers - Purchase Date", "Non-Buyers - Pseudo Purchase Date"))

fig.add_trace(go.Histogram(
    x=buyers_df['cc_purchase_date'],
    name="Buyers",
    histnorm='probability'  
), row=1, col=1)

fig.add_trace(go.Histogram(
    x=non_buyers_df['pseudo_purchase_date'],
    name="Non-Buyers",
    histnorm='probability'  
), row=1, col=2)

fig.update_layout(
    barmode='overlay',
    showlegend=False,
    title_text="Histogram of Purchase Dates: Buyers vs. Non-Buyers with Pseudo Purchase Dates"
)

fig.show()

# Rollup-Daten für Nicht-Käufer basierend auf pseudo Kaufdatum erstellen
non_buyers_event_info_df = get_rollup_window_data(
    df=merged_df_dynamic,
    buy_dates=non_buyers_df[['client_id', 'pseudo_purchase_date']],
    rollup_window=395,
    date_column='pseudo_purchase_date'
)

non_buyers_event_info_df.head(10)

non_buyers_event_info_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3094 entries, 0 to 3093
Data columns (total 25 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   client_id              3094 non-null   int64  
 1   total_spent            3094 non-null   float64
 2   num_transactions       3094 non-null   int64  
 3   avg_balance            3094 non-null   float64
 4   max_balance            3094 non-null   float64
 5   min_balance            3094 non-null   float64
 6   std_balance            3094 non-null   float64
 7   avg_trans_amount       3094 non-null   float64
 8   med_trans_amount       3094 non-null   float64
 9   max_trans_amount       3094 non-null   float64
 10  min_trans_amount       3094 non-null   float64
 11  std_trans_amount       3094 non-null   float64
 12  balance_before_cc      3094 non-null   float64
 13  transaction_type       3094 non-null   object 
 14  transaction_operation  3094 non-null   object 
 15  trans_k_symbol         3094 non-null   object 
 16  num_loans              3094 non-null   int64  
 17  loan_amount            3094 non-null   float64
 18  loan_duration          3094 non-null   float64
 19  loan_payments          3094 non-null   float64
 20  loan_status            3094 non-null   object 
 21  num_perm_orders        3094 non-null   int64  
 22  total_order_amount     3094 non-null   float64
 23  avg_order_amount       3094 non-null   float64
 24  order_k_symbol         3094 non-null   object 
dtypes: float64(16), int64(4), object(5)
memory usage: 604.4+ KB

non_buyers_event_info_df.columns

Index(['client_id', 'total_spent', 'num_transactions', 'avg_balance',
       'max_balance', 'min_balance', 'std_balance', 'avg_trans_amount',
       'med_trans_amount', 'max_trans_amount', 'min_trans_amount',
       'std_trans_amount', 'balance_before_cc', 'transaction_type',
       'transaction_operation', 'trans_k_symbol', 'num_loans', 'loan_amount',
       'loan_duration', 'loan_payments', 'loan_status', 'num_perm_orders',
       'total_order_amount', 'avg_order_amount', 'order_k_symbol'],
      dtype='object')

# Käufer: 'month' anhand von Kaufdatum (cc_purchase_date) mappen und in datetime umwandeln
buyers_event_info_df['month'] = pd.to_datetime(
    buyers_event_info_df['client_id'].map(
        buyers_df.set_index('client_id')['cc_purchase_date']
    )
)
# Nur Einträge mit gültigem Datum behalten
buyers_event_info_df = buyers_event_info_df[buyers_event_info_df['month'].notna()]
# Perioden in Timestamp konvertieren (erster Tag des Monats)
buyers_event_info_df['month'] = buyers_event_info_df['month'].dt.to_period('M').dt.to_timestamp()
# Monatliche Mittelwerte berechnen für ausgewählte numerische Features
buyers_monthly = buyers_event_info_df.groupby('month')[['avg_balance', 'avg_trans_amount', 'avg_order_amount']].mean().reset_index()
buyers_monthly['group'] = 'Buyer'  # Label zur Gruppenzuordnung

# Nicht-Käufer: 'month' anhand von Pseudo-Kaufdatum mappen und konvertieren
non_buyers_event_info_df['month'] = pd.to_datetime(
    non_buyers_event_info_df['client_id'].map(
        non_buyers_df.set_index('client_id')['pseudo_purchase_date']
    )
)
non_buyers_event_info_df = non_buyers_event_info_df[non_buyers_event_info_df['month'].notna()]
non_buyers_event_info_df['month'] = non_buyers_event_info_df['month'].dt.to_period('M').dt.to_timestamp()
non_buyers_monthly = non_buyers_event_info_df.groupby('month')[['avg_balance', 'avg_trans_amount', 'avg_order_amount']].mean().reset_index()
non_buyers_monthly['group'] = 'Non-Buyer'

# Käufer- und Nicht-Käufer-Daten zusammenführen
combined = pd.concat([buyers_monthly, non_buyers_monthly])

# Liste der Features, die einzeln geplottet werden sollen
features_to_plot = ['avg_balance', 'avg_trans_amount', 'avg_order_amount']

# Für jede Kennzahl einen separaten Zeitreihenplot erstellen
for feature in features_to_plot:
    plt.figure(figsize=(10, 5))
    sns.lineplot(data=combined, x='month', y=feature, hue='group', marker="o")
    plt.title(f'Zeitverlauf {feature} (Käufer vs. Nicht-Käufer)')
    plt.xlabel('Zeitverlauf')
    plt.ylabel(f'{feature}')
    plt.xticks(rotation=45)
    plt.grid(True)
    plt.tight_layout()
    plt.show()

# Liste wichtiger numerischer Features für Dichtevergleiche
num_features = ['total_spent', 'avg_balance', 'num_transactions', 'loan_amount']

# Für jedes Feature KDE-Plots erstellen, um Verteilungen von Käufern und Nicht-Käufern zu vergleichen
for feature in num_features:
    plt.figure(figsize=(8,4))
    sns.kdeplot(data=buyers_event_info_df, x=feature, label='Buyer', fill=True)       # Käufer
    sns.kdeplot(data=non_buyers_event_info_df, x=feature, label='Non-Buyer', fill=True) # Nicht-Käufer
    plt.title(f'Verteilung von {feature}')
    plt.legend()
    plt.tight_layout()
    plt.show()

# Kombinieren der DataFrames für Käufer und Nicht-Käufer
combined_event_info_df = pd.concat([buyers_event_info_df, non_buyers_event_info_df], ignore_index=True)

# Überprüfen des kombinierten DataFrames
combined_event_info_df.head(10)

combined_event_info_df.columns

Index(['client_id', 'total_spent', 'num_transactions', 'avg_balance',
       'max_balance', 'min_balance', 'std_balance', 'avg_trans_amount',
       'med_trans_amount', 'max_trans_amount', 'min_trans_amount',
       'std_trans_amount', 'balance_before_cc', 'transaction_type',
       'transaction_operation', 'trans_k_symbol', 'num_loans', 'loan_amount',
       'loan_duration', 'loan_payments', 'loan_status', 'num_perm_orders',
       'total_order_amount', 'avg_order_amount', 'order_k_symbol', 'month'],
      dtype='object')

combined_event_info_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3785 entries, 0 to 3784
Data columns (total 26 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   client_id              3785 non-null   int64         
 1   total_spent            3785 non-null   float64       
 2   num_transactions       3785 non-null   int64         
 3   avg_balance            3785 non-null   float64       
 4   max_balance            3785 non-null   float64       
 5   min_balance            3785 non-null   float64       
 6   std_balance            3785 non-null   float64       
 7   avg_trans_amount       3785 non-null   float64       
 8   med_trans_amount       3785 non-null   float64       
 9   max_trans_amount       3785 non-null   float64       
 10  min_trans_amount       3785 non-null   float64       
 11  std_trans_amount       3785 non-null   float64       
 12  balance_before_cc      3785 non-null   float64       
 13  transaction_type       3785 non-null   object        
 14  transaction_operation  3785 non-null   object        
 15  trans_k_symbol         3785 non-null   object        
 16  num_loans              3785 non-null   int64         
 17  loan_amount            3785 non-null   float64       
 18  loan_duration          3785 non-null   float64       
 19  loan_payments          3785 non-null   float64       
 20  loan_status            3785 non-null   object        
 21  num_perm_orders        3785 non-null   int64         
 22  total_order_amount     3785 non-null   float64       
 23  avg_order_amount       3785 non-null   float64       
 24  order_k_symbol         3785 non-null   object        
 25  month                  3785 non-null   datetime64[ns]
dtypes: datetime64[ns](1), float64(16), int64(4), object(5)
memory usage: 769.0+ KB

combined_df = combined_event_info_df.merge(merged_df_static, on="client_id", how="left")

combined_df.head()

combined_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3785 entries, 0 to 3784
Data columns (total 55 columns):
 #   Column                              Non-Null Count  Dtype         
---  ------                              --------------  -----         
 0   client_id                           3785 non-null   int64         
 1   total_spent                         3785 non-null   float64       
 2   num_transactions                    3785 non-null   int64         
 3   avg_balance                         3785 non-null   float64       
 4   max_balance                         3785 non-null   float64       
 5   min_balance                         3785 non-null   float64       
 6   std_balance                         3785 non-null   float64       
 7   avg_trans_amount                    3785 non-null   float64       
 8   med_trans_amount                    3785 non-null   float64       
 9   max_trans_amount                    3785 non-null   float64       
 10  min_trans_amount                    3785 non-null   float64       
 11  std_trans_amount                    3785 non-null   float64       
 12  balance_before_cc                   3785 non-null   float64       
 13  transaction_type                    3785 non-null   object        
 14  transaction_operation               3785 non-null   object        
 15  trans_k_symbol                      3785 non-null   object        
 16  num_loans                           3785 non-null   int64         
 17  loan_amount                         3785 non-null   float64       
 18  loan_duration                       3785 non-null   float64       
 19  loan_payments                       3785 non-null   float64       
 20  loan_status                         3785 non-null   object        
 21  num_perm_orders                     3785 non-null   int64         
 22  total_order_amount                  3785 non-null   float64       
 23  avg_order_amount                    3785 non-null   float64       
 24  order_k_symbol                      3785 non-null   object        
 25  month                               3785 non-null   datetime64[ns]
 26  birth_number                        3785 non-null   datetime64[ns]
 27  district_id_client                  3785 non-null   int64         
 28  gender                              3785 non-null   object        
 29  age                                 3785 non-null   int64         
 30  disp_id                             3785 non-null   int64         
 31  account_id                          3785 non-null   int64         
 32  disposition_type                    3785 non-null   category      
 33  card_id                             691 non-null    float64       
 34  credit_card_type                    691 non-null    category      
 35  issued                              691 non-null    datetime64[ns]
 36  disctrict_id_account                3785 non-null   int64         
 37  frequency                           3785 non-null   category      
 38  date                                3785 non-null   datetime64[ns]
 39  district_name                       3785 non-null   string        
 40  region                              3785 non-null   string        
 41  n_inhabitants                       3785 non-null   int64         
 42  n_municipals_lower_499              3785 non-null   int64         
 43  n_municipals_between_500_1999       3785 non-null   int64         
 44  n_municipals_between_2000_9999      3785 non-null   int64         
 45  n_municipals_higher_10000           3785 non-null   int64         
 46  n_cities                            3785 non-null   int64         
 47  ratio_urban_inhabitants             3785 non-null   float64       
 48  avg_salary                          3785 non-null   int64         
 49  unemployment_rate_1995              3741 non-null   float64       
 50  unemployment_rate_1996              3785 non-null   float64       
 51  n_enterpreneurs_per_1k_inhabitants  3785 non-null   int64         
 52  n_crimes_1995                       3741 non-null   float64       
 53  n_crimes_1996                       3785 non-null   int64         
 54  has_cc                              3785 non-null   int64         
dtypes: category(3), datetime64[ns](4), float64(21), int64(19), object(6), string(2)
memory usage: 1.5+ MB

combined_df.columns

Index(['client_id', 'total_spent', 'num_transactions', 'avg_balance',
       'max_balance', 'min_balance', 'std_balance', 'avg_trans_amount',
       'med_trans_amount', 'max_trans_amount', 'min_trans_amount',
       'std_trans_amount', 'balance_before_cc', 'transaction_type',
       'transaction_operation', 'trans_k_symbol', 'num_loans', 'loan_amount',
       'loan_duration', 'loan_payments', 'loan_status', 'num_perm_orders',
       'total_order_amount', 'avg_order_amount', 'order_k_symbol', 'month',
       'birth_number', 'district_id_client', 'gender', 'age', 'disp_id',
       'account_id', 'disposition_type', 'card_id', 'credit_card_type',
       'issued', 'disctrict_id_account', 'frequency', 'date', 'district_name',
       'region', 'n_inhabitants', 'n_municipals_lower_499',
       'n_municipals_between_500_1999', 'n_municipals_between_2000_9999',
       'n_municipals_higher_10000', 'n_cities', 'ratio_urban_inhabitants',
       'avg_salary', 'unemployment_rate_1995', 'unemployment_rate_1996',
       'n_enterpreneurs_per_1k_inhabitants', 'n_crimes_1995', 'n_crimes_1996',
       'has_cc'],
      dtype='object')

combined_df.rename(columns={
    'district_name': 'client_district_name',
    'region': 'client_region',
    'n_inhabitants': 'client_n_inhabitants',
    'n_municipals_lower_499': 'client_n_municipals_lower_499',
    'n_municipals_between_500_1999': 'client_n_municipals_between_500_1999',
    'n_municipals_between_2000_9999': 'client_n_municipals_between_2000_9999',
    'n_municipals_higher_10000': 'client_n_municipals_higher_10000',
    'n_cities': 'client_n_cities',
    'ratio_urban_inhabitants': 'client_ratio_urban_inhabitants',
    'avg_salary': 'client_avg_salary',
    'unemployment_rate_1995': 'client_unemployment_rate_1995',
    'unemployment_rate_1996': 'client_unemployment_rate_1996',
    'n_enterpreneurs_per_1k_inhabitants': 'client_n_enterpreneurs_per_1k_inhabitants',
    'n_crimes_1995': 'client_n_crimes_1995',
    'n_crimes_1996': 'client_n_crimes_1996'
}, inplace=True)

combined_df.columns

Index(['client_id', 'total_spent', 'num_transactions', 'avg_balance',
       'max_balance', 'min_balance', 'std_balance', 'avg_trans_amount',
       'med_trans_amount', 'max_trans_amount', 'min_trans_amount',
       'std_trans_amount', 'balance_before_cc', 'transaction_type',
       'transaction_operation', 'trans_k_symbol', 'num_loans', 'loan_amount',
       'loan_duration', 'loan_payments', 'loan_status', 'num_perm_orders',
       'total_order_amount', 'avg_order_amount', 'order_k_symbol', 'month',
       'birth_number', 'district_id_client', 'gender', 'age', 'disp_id',
       'account_id', 'disposition_type', 'card_id', 'credit_card_type',
       'issued', 'disctrict_id_account', 'frequency', 'date',
       'client_district_name', 'client_region', 'client_n_inhabitants',
       'client_n_municipals_lower_499', 'client_n_municipals_between_500_1999',
       'client_n_municipals_between_2000_9999',
       'client_n_municipals_higher_10000', 'client_n_cities',
       'client_ratio_urban_inhabitants', 'client_avg_salary',
       'client_unemployment_rate_1995', 'client_unemployment_rate_1996',
       'client_n_enterpreneurs_per_1k_inhabitants', 'client_n_crimes_1995',
       'client_n_crimes_1996', 'has_cc'],
      dtype='object')

combined_df.isna().sum().sort_values(ascending=False) # Anzahl der fehlenden Werte pro Spalte

credit_card_type                             3094
issued                                       3094
card_id                                      3094
client_n_crimes_1995                           44
client_unemployment_rate_1995                  44
total_spent                                     0
client_id                                       0
avg_trans_amount                                0
med_trans_amount                                0
max_trans_amount                                0
min_trans_amount                                0
std_trans_amount                                0
balance_before_cc                               0
transaction_type                                0
transaction_operation                           0
trans_k_symbol                                  0
num_loans                                       0
loan_amount                                     0
num_transactions                                0
avg_balance                                     0
max_balance                                     0
min_balance                                     0
std_balance                                     0
total_order_amount                              0
num_perm_orders                                 0
loan_status                                     0
loan_payments                                   0
loan_duration                                   0
order_k_symbol                                  0
district_id_client                              0
avg_order_amount                                0
disp_id                                         0
age                                             0
gender                                          0
disposition_type                                0
account_id                                      0
disctrict_id_account                            0
month                                           0
birth_number                                    0
date                                            0
frequency                                       0
client_district_name                            0
client_region                                   0
client_n_municipals_between_500_1999            0
client_n_municipals_between_2000_9999           0
client_n_inhabitants                            0
client_n_municipals_lower_499                   0
client_n_cities                                 0
client_n_municipals_higher_10000                0
client_avg_salary                               0
client_ratio_urban_inhabitants                  0
client_unemployment_rate_1996                   0
client_n_enterpreneurs_per_1k_inhabitants       0
client_n_crimes_1996                            0
has_cc                                          0
dtype: int64

combined_df.duplicated().sum()

np.int64(0)

combined_df.columns

Index(['client_id', 'total_spent', 'num_transactions', 'avg_balance',
       'max_balance', 'min_balance', 'std_balance', 'avg_trans_amount',
       'med_trans_amount', 'max_trans_amount', 'min_trans_amount',
       'std_trans_amount', 'balance_before_cc', 'transaction_type',
       'transaction_operation', 'trans_k_symbol', 'num_loans', 'loan_amount',
       'loan_duration', 'loan_payments', 'loan_status', 'num_perm_orders',
       'total_order_amount', 'avg_order_amount', 'order_k_symbol', 'month',
       'birth_number', 'district_id_client', 'gender', 'age', 'disp_id',
       'account_id', 'disposition_type', 'card_id', 'credit_card_type',
       'issued', 'disctrict_id_account', 'frequency', 'date',
       'client_district_name', 'client_region', 'client_n_inhabitants',
       'client_n_municipals_lower_499', 'client_n_municipals_between_500_1999',
       'client_n_municipals_between_2000_9999',
       'client_n_municipals_higher_10000', 'client_n_cities',
       'client_ratio_urban_inhabitants', 'client_avg_salary',
       'client_unemployment_rate_1995', 'client_unemployment_rate_1996',
       'client_n_enterpreneurs_per_1k_inhabitants', 'client_n_crimes_1995',
       'client_n_crimes_1996', 'has_cc'],
      dtype='object')

final_df = combined_df.drop(columns=['birth_number', 
                                     'account_id', 
                                     'disp_id', 
                                     'district_id_client', 
                                     'client_id', 
                                     'disposition_type', 
                                     'disctrict_id_account', 
                                     'card_id', 
                                     'credit_card_type', 
                                     'issued'
                                     ])

final_df.columns

Index(['total_spent', 'num_transactions', 'avg_balance', 'max_balance',
       'min_balance', 'std_balance', 'avg_trans_amount', 'med_trans_amount',
       'max_trans_amount', 'min_trans_amount', 'std_trans_amount',
       'balance_before_cc', 'transaction_type', 'transaction_operation',
       'trans_k_symbol', 'num_loans', 'loan_amount', 'loan_duration',
       'loan_payments', 'loan_status', 'num_perm_orders', 'total_order_amount',
       'avg_order_amount', 'order_k_symbol', 'month', 'gender', 'age',
       'frequency', 'date', 'client_district_name', 'client_region',
       'client_n_inhabitants', 'client_n_municipals_lower_499',
       'client_n_municipals_between_500_1999',
       'client_n_municipals_between_2000_9999',
       'client_n_municipals_higher_10000', 'client_n_cities',
       'client_ratio_urban_inhabitants', 'client_avg_salary',
       'client_unemployment_rate_1995', 'client_unemployment_rate_1996',
       'client_n_enterpreneurs_per_1k_inhabitants', 'client_n_crimes_1995',
       'client_n_crimes_1996', 'has_cc'],
      dtype='object')

final_df.shape

(3785, 45)

final_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3785 entries, 0 to 3784
Data columns (total 45 columns):
 #   Column                                     Non-Null Count  Dtype         
---  ------                                     --------------  -----         
 0   total_spent                                3785 non-null   float64       
 1   num_transactions                           3785 non-null   int64         
 2   avg_balance                                3785 non-null   float64       
 3   max_balance                                3785 non-null   float64       
 4   min_balance                                3785 non-null   float64       
 5   std_balance                                3785 non-null   float64       
 6   avg_trans_amount                           3785 non-null   float64       
 7   med_trans_amount                           3785 non-null   float64       
 8   max_trans_amount                           3785 non-null   float64       
 9   min_trans_amount                           3785 non-null   float64       
 10  std_trans_amount                           3785 non-null   float64       
 11  balance_before_cc                          3785 non-null   float64       
 12  transaction_type                           3785 non-null   object        
 13  transaction_operation                      3785 non-null   object        
 14  trans_k_symbol                             3785 non-null   object        
 15  num_loans                                  3785 non-null   int64         
 16  loan_amount                                3785 non-null   float64       
 17  loan_duration                              3785 non-null   float64       
 18  loan_payments                              3785 non-null   float64       
 19  loan_status                                3785 non-null   object        
 20  num_perm_orders                            3785 non-null   int64         
 21  total_order_amount                         3785 non-null   float64       
 22  avg_order_amount                           3785 non-null   float64       
 23  order_k_symbol                             3785 non-null   object        
 24  month                                      3785 non-null   datetime64[ns]
 25  gender                                     3785 non-null   object        
 26  age                                        3785 non-null   int64         
 27  frequency                                  3785 non-null   category      
 28  date                                       3785 non-null   datetime64[ns]
 29  client_district_name                       3785 non-null   string        
 30  client_region                              3785 non-null   string        
 31  client_n_inhabitants                       3785 non-null   int64         
 32  client_n_municipals_lower_499              3785 non-null   int64         
 33  client_n_municipals_between_500_1999       3785 non-null   int64         
 34  client_n_municipals_between_2000_9999      3785 non-null   int64         
 35  client_n_municipals_higher_10000           3785 non-null   int64         
 36  client_n_cities                            3785 non-null   int64         
 37  client_ratio_urban_inhabitants             3785 non-null   float64       
 38  client_avg_salary                          3785 non-null   int64         
 39  client_unemployment_rate_1995              3741 non-null   float64       
 40  client_unemployment_rate_1996              3785 non-null   float64       
 41  client_n_enterpreneurs_per_1k_inhabitants  3785 non-null   int64         
 42  client_n_crimes_1995                       3741 non-null   float64       
 43  client_n_crimes_1996                       3785 non-null   int64         
 44  has_cc                                     3785 non-null   int64         
dtypes: category(1), datetime64[ns](2), float64(20), int64(14), object(6), string(2)
memory usage: 1.3+ MB

# 1) Aufteilung der Daten in Trainings- und Testsets
# - 80% Training, 20% Test
# - stratified, um Klassenverteilung in Zielvariable beizubehalten
target = "has_cc"                      # Zielvariable: Kreditkartenbesitz
X = final_df.drop(columns=[target])   # Features ohne Zielvariable
y = final_df[target]                   # Zielvariable

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(X_train.columns)  # Spaltenübersicht der Trainingsdaten

Index(['total_spent', 'num_transactions', 'avg_balance', 'max_balance',
       'min_balance', 'std_balance', 'avg_trans_amount', 'med_trans_amount',
       'max_trans_amount', 'min_trans_amount', 'std_trans_amount',
       'balance_before_cc', 'transaction_type', 'transaction_operation',
       'trans_k_symbol', 'num_loans', 'loan_amount', 'loan_duration',
       'loan_payments', 'loan_status', 'num_perm_orders', 'total_order_amount',
       'avg_order_amount', 'order_k_symbol', 'month', 'gender', 'age',
       'frequency', 'date', 'client_district_name', 'client_region',
       'client_n_inhabitants', 'client_n_municipals_lower_499',
       'client_n_municipals_between_500_1999',
       'client_n_municipals_between_2000_9999',
       'client_n_municipals_higher_10000', 'client_n_cities',
       'client_ratio_urban_inhabitants', 'client_avg_salary',
       'client_unemployment_rate_1995', 'client_unemployment_rate_1996',
       'client_n_enterpreneurs_per_1k_inhabitants', 'client_n_crimes_1995',
       'client_n_crimes_1996'],
      dtype='object')

# --- Fehlende Werte in Trainingsdaten prüfen ---
print("Fehlende Werte im TRAININGSDATENSATZ:")

# Gesamtzahl der fehlenden Werte und prozentualer Anteil an allen Zellen
total_nans_train = X_train.isnull().sum().sum()
total_cells_train = X_train.shape[0] * X_train.shape[1]
nan_percentage_train = (total_nans_train / total_cells_train) * 100
print(f"Fehlende Werte: {total_nans_train} von {total_cells_train} Zellen ({nan_percentage_train:.2f}%)")

# Detaillierte Übersicht: Anzahl und Prozentsatz fehlender Werte je Spalte, sortiert nach Anteil
nan_summary_train = X_train.isnull().sum().to_frame(name="NaN Anzahl")
nan_summary_train["Gesamt"] = len(X_train)
nan_summary_train["NaN %"] = (nan_summary_train["NaN Anzahl"] / nan_summary_train["Gesamt"]) * 100
nan_summary_train = nan_summary_train[nan_summary_train["NaN Anzahl"] > 0]
nan_summary_train = nan_summary_train.sort_values("NaN %", ascending=False)
print(nan_summary_train)

# --- Fehlende Werte in Testdaten prüfen ---
print("\nFehlende Werte im TESTDATENSATZ:")

total_nans_test = X_test.isnull().sum().sum()
total_cells_test = X_test.shape[0] * X_test.shape[1]
nan_percentage_test = (total_nans_test / total_cells_test) * 100
print(f"Fehlende Werte: {total_nans_test} von {total_cells_test} Zellen ({nan_percentage_test:.2f}%)")

nan_summary_test = X_test.isnull().sum().to_frame(name="NaN Anzahl")
nan_summary_test["Gesamt"] = len(X_test)
nan_summary_test["NaN %"] = (nan_summary_test["NaN Anzahl"] / nan_summary_test["Gesamt"]) * 100
nan_summary_test = nan_summary_test[nan_summary_test["NaN Anzahl"] > 0]
nan_summary_test = nan_summary_test.sort_values("NaN %", ascending=False)
print(nan_summary_test)

Fehlende Werte im TRAININGSDATENSATZ:
Fehlende Werte: 56 von 133232 Zellen (0.04%)
                               NaN Anzahl  Gesamt  NaN %
client_unemployment_rate_1995          28    3028   0.92
client_n_crimes_1995                   28    3028   0.92

Fehlende Werte im TESTDATENSATZ:
Fehlende Werte: 32 von 33308 Zellen (0.10%)
                               NaN Anzahl  Gesamt  NaN %
client_unemployment_rate_1995          16     757   2.11
client_n_crimes_1995                   16     757   2.11

cols_to_impute = ["client_unemployment_rate_1995", "client_n_crimes_1995"]

# Median nur aus Training berechnen
medians = X_train[cols_to_impute].median()

# Auf beide Datensätze anwenden
X_train[cols_to_impute] = X_train[cols_to_impute].fillna(medians)
X_test[cols_to_impute]  = X_test[cols_to_impute].fillna(medians)

print("Fehlende Werte im TRAININGSDATENSATZ:")

total_nans_train = X_train.isnull().sum().sum()
total_cells_train = X_train.shape[0] * X_train.shape[1]
nan_percentage_train = (total_nans_train / total_cells_train) * 100

print(f"Fehlende Werte: {total_nans_train} von {total_cells_train} Zellen ({nan_percentage_train:.2f}%)")

nan_summary_train = X_train.isnull().sum().to_frame(name="NaN Anzahl")
nan_summary_train["Gesamt"] = len(X_train)
nan_summary_train["NaN %"] = (nan_summary_train["NaN Anzahl"] / nan_summary_train["Gesamt"]) * 100
nan_summary_train = nan_summary_train[nan_summary_train["NaN Anzahl"] > 0]
nan_summary_train = nan_summary_train.sort_values("NaN %", ascending=False)

print(nan_summary_train)


print("\nFehlende Werte im TESTDATENSATZ:")

total_nans_test = X_test.isnull().sum().sum()
total_cells_test = X_test.shape[0] * X_test.shape[1]
nan_percentage_test = (total_nans_test / total_cells_test) * 100

print(f"Fehlende Werte: {total_nans_test} von {total_cells_test} Zellen ({nan_percentage_test:.2f}%)")

nan_summary_test = X_test.isnull().sum().to_frame(name="NaN Anzahl")
nan_summary_test["Gesamt"] = len(X_test)
nan_summary_test["NaN %"] = (nan_summary_test["NaN Anzahl"] / nan_summary_test["Gesamt"]) * 100
nan_summary_test = nan_summary_test[nan_summary_test["NaN Anzahl"] > 0]
nan_summary_test = nan_summary_test.sort_values("NaN %", ascending=False)

print(nan_summary_test)

Fehlende Werte im TRAININGSDATENSATZ:
Fehlende Werte: 0 von 133232 Zellen (0.00%)
Empty DataFrame
Columns: [NaN Anzahl, Gesamt, NaN %]
Index: []

Fehlende Werte im TESTDATENSATZ:
Fehlende Werte: 0 von 33308 Zellen (0.00%)
Empty DataFrame
Columns: [NaN Anzahl, Gesamt, NaN %]
Index: []

print(X_train.columns)

Index(['total_spent', 'num_transactions', 'avg_balance', 'max_balance',
       'min_balance', 'std_balance', 'avg_trans_amount', 'med_trans_amount',
       'max_trans_amount', 'min_trans_amount', 'std_trans_amount',
       'balance_before_cc', 'transaction_type', 'transaction_operation',
       'trans_k_symbol', 'num_loans', 'loan_amount', 'loan_duration',
       'loan_payments', 'loan_status', 'num_perm_orders', 'total_order_amount',
       'avg_order_amount', 'order_k_symbol', 'month', 'gender', 'age',
       'frequency', 'date', 'client_district_name', 'client_region',
       'client_n_inhabitants', 'client_n_municipals_lower_499',
       'client_n_municipals_between_500_1999',
       'client_n_municipals_between_2000_9999',
       'client_n_municipals_higher_10000', 'client_n_cities',
       'client_ratio_urban_inhabitants', 'client_avg_salary',
       'client_unemployment_rate_1995', 'client_unemployment_rate_1996',
       'client_n_enterpreneurs_per_1k_inhabitants', 'client_n_crimes_1995',
       'client_n_crimes_1996'],
      dtype='object')

# Alle Zeilen in Ausgaben erlauben
pd.set_option('display.max_rows', None)

display(
    X_train.dtypes
           .reset_index()
           .rename(columns={'index': 'column', 0: 'dtype'})
)

baseline_features = ['age', 'gender', 'client_region', 'total_spent', 'balance_before_cc']

# 1) Aufteilen der Features in numerische und kategoriale Spalten
#    Numerische Spalten werden standardisiert, kategoriale one-hot-kodiert
num_cols = ['age', 'total_spent', 'balance_before_cc']
cat_cols = ['gender', 'client_region']

# 2) Aufbau des Preprocessing mit ColumnTransformer
#    Numerische Features: StandardScaler (Mittelwert 0, Varianz 1)
#    Kategoriale Features: OneHotEncoder mit Drop-First, um Multikollinearität zu vermeiden
preprocessor = ColumnTransformer(transformers=[
    ('num', StandardScaler(), num_cols),
    ('cat', OneHotEncoder(drop='first'), cat_cols)
])

# 3) Definition von drei Modell-Pipelines mit unterschiedlichen Methoden zur Behandlung von Klassenungleichgewicht

# Baseline mit LogisticRegression und class_weight='balanced' zur Gewichtung seltener Klassen
baseline_pipeline = ImbPipeline(steps=[
    ('preprocessing', preprocessor),
    ('classifier', LogisticRegression(
        max_iter=10_000,
        class_weight='balanced',
        solver='lbfgs',
        random_state=42
    ))
])

# Pipeline mit SMOTE (Synthetic Minority Over-sampling Technique) zur künstlichen Erzeugung von Minderheitsklasse
pipe_smote = ImbPipeline(steps=[
    ('preprocessing', preprocessor),
    ('smote', SMOTE(random_state=42)),
    ('classifier', LogisticRegression(
        max_iter=10_000,
        solver='lbfgs',
        random_state=42
    ))
])

# Pipeline mit RandomUnderSampler zur Reduktion der Mehrheitsklasse
pipe_undersample = ImbPipeline(steps=[
    ('preprocessing', preprocessor),
    ('undersample', RandomUnderSampler(random_state=42)),
    ('classifier', LogisticRegression(
        max_iter=10_000,
        solver='lbfgs',
        random_state=42
    ))
])

# 4) Cross-Validation mit stratified KFold (5 Splits), um die Klassenzusammensetzung stabil zu halten
#    Evaluierung mit mehreren Metriken: accuracy, precision, recall, f1, roc_auc
pipelines = {
    "Baseline (class_weight)": baseline_pipeline,
    "SMOTE": pipe_smote,
    "Undersampling": pipe_undersample
}

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scoring = ['accuracy', 'precision', 'recall', 'f1', 'roc_auc']
cv_results = {}

for name, pipe in pipelines.items():
    # Cross-Validation für jedes Pipeline-Modell durchführen
    scores = cross_validate(pipe, X_train[num_cols + cat_cols], y_train,
                            cv=cv, scoring=scoring, n_jobs=-1)
    # Mittelwerte der Metriken sammeln
    cv_results[name] = {m: scores[f'test_{m}'].mean() for m in scoring}

# 5) Ergebnisse in einem DataFrame übersichtlich zusammenfassen und auf drei Dezimalstellen runden
results_df = pd.DataFrame(cv_results).T.round(3)

# Tabelle anzeigen
display(results_df)

#Fit auf gesamtem Trainingsdatensatz:
baseline_pipeline.fit(X_train[baseline_features], y_train)

# Vorhersagen & Wahrscheinlichkeiten
y_pred_test = baseline_pipeline.predict(X_test[baseline_features])
y_proba_test = baseline_pipeline.predict_proba(X_test[baseline_features])[:, 1]

# Klassifikationsbericht
print("\n=== Klassifikationsbericht (Test-Set) – Baseline Logistic Regression (Pipeline) ===")
print(classification_report(y_test, y_pred_test, digits=3))

# ROC-AUC
roc_auc = roc_auc_score(y_test, y_proba_test)
print(f"ROC-AUC (Test-Set): {roc_auc:.3f}")

=== Klassifikationsbericht (Test-Set) – Baseline Logistic Regression (Pipeline) ===
              precision    recall  f1-score   support

           0      0.924     0.790     0.852       619
           1      0.430     0.710     0.536       138

    accuracy                          0.775       757
   macro avg      0.677     0.750     0.694       757
weighted avg      0.834     0.775     0.794       757

ROC-AUC (Test-Set): 0.837

# Confusion Matrix – Test-Set

cm_test = confusion_matrix(y_test, y_pred_test, normalize='true')

plt.figure(figsize=(6, 4))
sns.heatmap(cm_test, annot=True, fmt=".2f", cmap="viridis")
plt.title("Confusion Matrix (Test-Set) – Baseline Logistic Regression")
plt.xlabel("Vorhergesagt")
plt.ylabel("Tatsächlich")
plt.tight_layout()
plt.show()

# ROC-Kurve Werte berechnen
fpr, tpr, thresholds = roc_curve(y_test, y_proba_test)

plt.plot(fpr, tpr, label="ROC Curve")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve mit markierten Thresholds")

# Schwellenwerte einzeichnen
for i in range(0, len(thresholds), len(thresholds)//10):
    plt.annotate(f"{thresholds[i]:.2f}", (fpr[i], tpr[i]))

plt.plot([0, 1], [0, 1], linestyle="--", color="gray")
plt.legend()
plt.grid(True)
plt.show()

# Trainingsdaten auf kontinuierliche numerische Spalten beschränken
#    (Float- und Integer-Spalten, keine kategorischen oder Dummy-Variablen)

num_base_cols = (
    X_train
      .select_dtypes(include=["float64", "int64"])
      .columns
)
X_vif = X_train[num_base_cols].copy()

# Spalten mit konstanter Varianz (Standardabweichung 0) entfernen
X_vif = X_vif.loc[:, X_vif.std() > 0]

def calculate_vif_matrix(X):
    """Berechnet den Variance Inflation Factor (VIF) für alle Spalten in X."""
    vif_data = []
    for i in range(X.shape[1]):
        try:
            vif_val = variance_inflation_factor(X.values, i)
        except Exception:
            vif_val = np.nan
        vif_data.append(vif_val)
    return pd.DataFrame({
        "feature": X.columns,
        "VIF": vif_data
    })

def stepwise_vif_reduction(X_df, threshold=10.0, verbose=True, plot_progress=True):
    """
    Iterative Reduktion von Features basierend auf VIF.
    Entfernt jeweils die Variable mit dem höchsten VIF über dem Schwellenwert.

    Rückgabe:
        dict mit initialem und finalem VIF, entfernten Features, Verlauf und Vergleich.
    """
    # 1. Datenkopie anlegen und Liste für entfernte Features initialisieren
    X = X_df.copy()
    removed = []
    vif_history = []

    # 2. Nur numerische Spalten behalten und konstante Spalten entfernen
    X = X.select_dtypes(include="number")
    X = X.loc[:, X.std() > 0]

    # 3. VIF vor Reduktion berechnen
    initial_vif = calculate_vif_matrix(X)

    while True:
        vif_df = calculate_vif_matrix(X)
        max_vif = vif_df["VIF"].max()
        mean_vif = vif_df["VIF"].mean()
        num_critical = (vif_df["VIF"] > threshold).sum()

        # Verlauf speichern
        vif_history.append({
            "iteration": len(removed),
            "max_vif": max_vif,
            "mean_vif": mean_vif,
            "num_critical": num_critical
        })

        # Stoppen, wenn alle VIF-Werte unter dem Schwellenwert liegen
        if max_vif <= threshold:
            break

        # Feature mit höchstem VIF entfernen
        drop_feature = vif_df.sort_values("VIF", ascending=False).iloc[0]["feature"]
        removed.append(drop_feature)
        if verbose:
            print(f"Entferne '{drop_feature}' mit VIF={max_vif:.2f}")
        X = X.drop(columns=[drop_feature])

    # 4. Finalen VIF berechnen und Vergleich mit initialem erstellen
    final_vif = calculate_vif_matrix(X)
    comparison = pd.merge(
        initial_vif, final_vif, on="feature", how="inner", suffixes=("_before", "_after")
    )
    comparison["ΔVIF"] = comparison["VIF_after"] - comparison["VIF_before"]

    # 5. Optional: Verlauf der VIF-Werte plotten
    if plot_progress:
        progress_df = pd.DataFrame(vif_history)
        plt.figure(figsize=(8, 5))
        plt.plot(progress_df["iteration"], progress_df["max_vif"], label="Max VIF")
        plt.plot(progress_df["iteration"], progress_df["mean_vif"], label="Mean VIF")
        plt.xlabel("Iteration")
        plt.ylabel("VIF")
        plt.title("VIF-Reduktionsverlauf")
        plt.grid(True)
        plt.legend()
        plt.tight_layout()
        plt.show()

    return {
        "initial_vif": initial_vif.sort_values("VIF", ascending=False).reset_index(drop=True),
        "final_vif": final_vif.sort_values("VIF", ascending=False).reset_index(drop=True),
        "removed_features": removed,
        "comparison": comparison.sort_values("ΔVIF", ascending=False),
        "vif_progress": pd.DataFrame(vif_history),
        "n_initial": initial_vif.shape[0],
        "n_final": final_vif.shape[0],
        "n_removed": len(removed)
    }

# Stepwise-VIF-Reduktion
vif_result = stepwise_vif_reduction(
    X_vif,
    threshold=10.0,
    verbose=True,
    plot_progress=True           # Kurve wird angezeigt
)

Entferne 'client_n_crimes_1995' mit VIF=4929.39
Entferne 'client_n_inhabitants' mit VIF=470.50
Entferne 'client_avg_salary' mit VIF=228.95
Entferne 'max_balance' mit VIF=162.35
Entferne 'client_unemployment_rate_1996' mit VIF=80.72
Entferne 'std_trans_amount' mit VIF=63.68
Entferne 'avg_balance' mit VIF=49.10
Entferne 'client_n_enterpreneurs_per_1k_inhabitants' mit VIF=48.08
Entferne 'num_loans' mit VIF=28.66
Entferne 'std_balance' mit VIF=27.35
Entferne 'client_ratio_urban_inhabitants' mit VIF=23.65
Entferne 'total_order_amount' mit VIF=16.01
Entferne 'balance_before_cc' mit VIF=14.56
Entferne 'max_trans_amount' mit VIF=13.95
Entferne 'client_n_cities' mit VIF=11.11
Entferne 'num_perm_orders' mit VIF=10.06

print(f"Ursprüngliche Anzahl Features: {vif_result['n_initial']}")
print(f"Verbleibende Features:         {vif_result['n_final']}")
print(f"Entfernte Features:            {vif_result['n_removed']}")

Ursprüngliche Anzahl Features: 33
Verbleibende Features:         17
Entfernte Features:            16

print("\nVergleichstabelle:")
vif_result["comparison"].sort_values("ΔVIF", ascending=False)

Vergleichstabelle:

# Extrahieren der reduzierten Featureliste aus dem VIF-Ergebnis für die Modellbildung
num_reduced = vif_result["final_vif"]["feature"].tolist()

# Kontrolle, ob nur numerische Spalten in den reduzierten Features sind
non_num_cols = [col for col in num_reduced if not np.issubdtype(X_train[col].dtype, np.number)]
print("Nicht-numerische Spalten im VIF-Ergebnis:", non_num_cols)

# Kategorische Spalten im Trainingsdatensatz ermitteln
cat_cols = X_train.select_dtypes(include=["object", "string", "category"]).columns.tolist()

# --------------------------------------------------
# Definition des Preprocessing mit ColumnTransformer
#    Numerische Features: StandardScaler (Median-Impute optional ergänzt)
#    Kategorische Features: OneHotEncoder mit Behandlung unbekannter Kategorien, Drop-First zur Vermeidung von Dummy-Variablen-Falle
preprocessor = ColumnTransformer(
    transformers=[
        ("num", Pipeline([
            ("sc", StandardScaler())
        ]), num_reduced),

        ("cat", OneHotEncoder(handle_unknown="ignore", drop="first", sparse_output=False), cat_cols)
    ]
)

# --------------------------------------------------
# Pipeline mit Preprocessing und Logistischer Regression (ausgeglichen durch class_weight)
baseline_pipeline_reduced = Pipeline(steps=[
    ("prep", preprocessor),
    ("clf", LogisticRegression(
        max_iter=10_000,
        class_weight="balanced",
        solver="lbfgs",
        random_state=42
    ))
])

Nicht-numerische Spalten im VIF-Ergebnis: []

# Transformation der Trainingsdaten mit dem Preprocessing-Teil der Pipeline
X_train_transformed = baseline_pipeline_reduced.named_steps['prep'].fit_transform(X_train)

# Numerische Feature-Namen aus der reduzierten Featureliste
num_features = num_reduced

# One-Hot-kodierte kategorische Feature-Namen aus dem ColumnTransformer extrahieren
cat_features = baseline_pipeline_reduced.named_steps['prep']\
    .named_transformers_['cat'].get_feature_names_out(cat_cols)

# Alle Feature-Namen zusammenfügen (numerisch + kategorisch)
all_features = list(num_features) + list(cat_features)

# DataFrame mit den transformierten Trainingsdaten und passenden Spaltennamen erstellen
X_train_transformed_df = pd.DataFrame(X_train_transformed, columns=all_features)

# Anzeigeoption einstellen, um alle Spalten im DataFrame sichtbar zu machen
pd.set_option('display.max_columns', None)

# DataFrame mit den ersten fünf Zeilen anzeigen
display(X_train_transformed_df.head())

# Cross-Validation

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scoring = ["accuracy", "precision", "recall", "f1", "roc_auc"]

cv_result_reduced = cross_validate(
    baseline_pipeline_reduced,
    X_train,                  # <-- jetzt gesamtes X_train (numerisch + kategorial)
    y_train,
    cv=cv,
    scoring=scoring,
    n_jobs=-1
)

print("=== Ø-Scores (5-CV) ===")
for m in scoring:
    print(f"{m:<9s}: {cv_result_reduced[f'test_{m}'].mean():.3f}")

=== Ø-Scores (5-CV) ===
accuracy : 0.760
precision: 0.414
recall   : 0.750
f1       : 0.533
roc_auc  : 0.835

# Liste der zu vergleichenden Metriken
metrics = ['accuracy', 'precision', 'recall', 'f1', 'roc_auc']

# Sampling-Modelle in gewünschter Reihenfolge (optional)
model_names = ["Baseline (class_weight)", "SMOTE", "Undersampling"]

# Vergleichstabelle erstellen:
# - Spalte 1: Mittelwerte aus Cross-Validation für das ursprüngliche Baseline-Modell
# - Spalte 2: Mittelwerte für das reduzierte Modell (nach VIF-Selektion)
# - Spalte 3: Differenz der Mittelwerte (Reduced − Baseline), um Verbesserungen oder Einbuss$sen zu erkennen

# Vergleichstabelle erstellen
comparison_df = pd.DataFrame({
    'Baseline (class_weight)': [cv_results["Baseline (class_weight)"][m] for m in metrics],
    'VIF-reduziert (mean)':    [cv_result_reduced[f'test_{m}'].mean() for m in metrics],
    'Δ (VIF − Baseline)': [
        cv_result_reduced[f'test_{m}'].mean() - cv_results["Baseline (class_weight)"][m]
        for m in metrics
    ]
}, index=metrics).round(3)

comparison_df

# 1) Spalten nach Datentypen trennen: numerisch vs. kategorisch
num_cols = [col for col in X_train.columns if X_train[col].dtype in ['int64', 'float64']]
cat_cols = [col for col in X_train.columns if X_train[col].dtype in ['object', 'string', 'category']]

# Kategorische Spalten als String casten (für Pipeline-Kompatibilität)
X_train[cat_cols] = X_train[cat_cols].astype(str)
X_test[cat_cols] = X_test[cat_cols].astype(str)

# 2) Preprocessing Pipeline definieren
# Numerische Spalten: fehlende Werte mit Mittelwert auffüllen, dann skalieren
# Kategorische Spalten: fehlende Werte mit "missing" auffüllen, dann OneHot-Encoding
preprocess = ColumnTransformer([
    ("num", Pipeline([
        ("impute", SimpleImputer(strategy="mean")),
        ("scale", StandardScaler())
    ]), num_cols),
    
    ("cat", Pipeline([
        ("impute", SimpleImputer(strategy="constant", fill_value="missing")),
        ("encode", OneHotEncoder(handle_unknown="ignore"))
    ]), cat_cols)
])

# Positives Klassenverhältnis für XGBoost (Wichtung der Minderheitsklasse)
pos_weight = y_train.value_counts()[0] / y_train.value_counts()[1]

# Klassifikatoren definieren mit spezifischen Parametern
rf = RandomForestClassifier(n_estimators=400, class_weight='balanced', random_state=42)

brf = BalancedRandomForestClassifier(n_estimators=400, random_state=42)

xgb_clf = XGBClassifier(
    objective="binary:logistic",
    eval_metric="auc",
    scale_pos_weight=pos_weight,
    n_estimators=400,
    random_state=42
)

hgb = HistGradientBoostingClassifier(
    loss="log_loss",
    class_weight="balanced",
    random_state=42
)

# Funktion zum Umwandeln von sparse Matrix in dichten Array (für XGBoost und HGB)
to_dense = FunctionTransformer(lambda X: X.toarray(), accept_sparse=True)

# Pipelines für die verschiedenen Modelle zusammenstellen
pipelines = {
    "Random Forest": Pipeline([
        ("prep", preprocess),
        ("clf", rf)
    ]),
    "Balanced RF": Pipeline([
        ("prep", preprocess),
        ("clf", brf)
    ]),
    "XGBoost": Pipeline([
        ("prep", preprocess),
        ("dense", to_dense),   # Konvertierung nötig für XGBoost
        ("clf", xgb_clf)
    ]),
    "HistGradBoost": Pipeline([
        ("prep", preprocess),
        ("dense", to_dense),   # Konvertierung nötig für HistGradientBoosting
        ("clf", hgb)
    ])
}

# Beispiel: XGBoost-Pipeline auswählen und trainieren (fit)
pipe = pipelines["XGBoost"]
pipe.fit(X_train, y_train)

# Zugriff auf den ColumnTransformer und OneHotEncoder für Kategorische Features
column_transformer = pipe.named_steps["prep"]
ohe = column_transformer.named_transformers_["cat"].named_steps["encode"]

# Ausgabe der Kategorien pro kategorischer Spalte
print("\nKategorische Spalten und erkannte Kategorien:")
for col, cats in zip(cat_cols, ohe.categories_):
    print(f"{col}: {list(cats)}")

Kategorische Spalten und erkannte Kategorien:
transaction_type: ['credit', 'missing', 'withdrawal']
transaction_operation: ['collection_from_another_bank', 'credit_in_cash', 'missing', 'remittance_to_another_bank', 'unknown', 'withdrawal_in_cash']
trans_k_symbol: ['household_payment', 'interest_credited', 'missing', 'oldage_pension', 'payment_for_statement', 'unknown']
loan_status: ['finished_debts', 'finished_ok', 'missing', 'running_debts', 'running_ok']
order_k_symbol: ['household_payment', 'leasing_payment', 'loan_payment', 'missing']
gender: ['F', 'M']
frequency: ['issuance_after_transaction', 'monthly_issuance', 'weekly_issuance']
client_district_name: ['Benesov', 'Beroun', 'Blansko', 'Breclav', 'Brno - mesto', 'Brno - venkov', 'Bruntal', 'Ceska Lipa', 'Ceske Budejovice', 'Cesky Krumlov', 'Cheb', 'Chomutov', 'Chrudim', 'Decin', 'Domazlice', 'Frydek - Mistek', 'Havlickuv Brod', 'Hl.m. Praha', 'Hodonin', 'Hradec Kralove', 'Jablonec n. Nisou', 'Jesenik', 'Jicin', 'Jihlava', 'Jindrichuv Hradec', 'Karlovy Vary', 'Karvina', 'Kladno', 'Klatovy', 'Kolin', 'Kromeriz', 'Kutna Hora', 'Liberec', 'Litomerice', 'Louny', 'Melnik', 'Mlada Boleslav', 'Most', 'Nachod', 'Novy Jicin', 'Nymburk', 'Olomouc', 'Opava', 'Ostrava - mesto', 'Pardubice', 'Pelhrimov', 'Pisek', 'Plzen - jih', 'Plzen - mesto', 'Plzen - sever', 'Prachatice', 'Praha - vychod', 'Praha - zapad', 'Prerov', 'Pribram', 'Prostejov', 'Rakovnik', 'Rokycany', 'Rychnov nad Kneznou', 'Semily', 'Sokolov', 'Strakonice', 'Sumperk', 'Svitavy', 'Tabor', 'Tachov', 'Teplice', 'Trebic', 'Trutnov', 'Uherske Hradiste', 'Usti nad Labem', 'Usti nad Orlici', 'Vsetin', 'Vyskov', 'Zdar nad Sazavou', 'Zlin', 'Znojmo']
client_region: ['Prague', 'central Bohemia', 'east Bohemia', 'north Bohemia', 'north Moravia', 'south Bohemia', 'south Moravia', 'west Bohemia']

# Cross-Validation mit stratified 5-Fold für verschiedene Modelle durchführen
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scoring = ['accuracy', 'precision', 'recall', 'f1', 'roc_auc']

results = {}

for name, pipe in pipelines.items():
    # Cross-Validate für jedes Modell mit mehreren Metriken parallel berechnen
    scores = cross_validate(pipe, X_train, y_train, cv=cv, scoring=scoring, n_jobs=-1)
    # Mittelwerte der Testergebnisse sammeln
    results[name] = {m: scores[f'test_{m}'].mean() for m in scoring}

# Ergebnisse in DataFrame umwandeln und runden
results_df = pd.DataFrame(results).T.round(3)

# Ergebnis des VIF-reduzierten LogReg-Modells als neue Zeile hinzufügen
reduced_entry = {
    m: cv_result_reduced[f'test_{m}'].mean()
    for m in scoring
}
results_df.loc["LogReg (VIF reduced)"] = reduced_entry

# Ergebnisse anzeigen
results_df = results_df.round(3)
display(results_df)

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Plots initialisieren
plt.figure(figsize=(18, 6))

# ROC-Kurve plot erstellen
plt.subplot(1, 3, 1)
for name, pipe in pipelines.items():
    y_proba = cross_val_predict(pipe, X_train, y_train, cv=cv, method='predict_proba', n_jobs=-1)[:, 1]
    fpr, tpr, _ = roc_curve(y_train, y_proba)
    roc_auc = auc(fpr, tpr)
    plt.plot(fpr, tpr, label=f'{name} (AUC={roc_auc:.2f})')

plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve - Model')
plt.legend()

<matplotlib.legend.Legend at 0x1e104694a50>

# Data vorbereiten: melt() erzeugt ein long-format DataFrame
df_melted = results_df.reset_index().melt(id_vars='index')
df_melted.columns = ['Modell', 'Metrik', 'Wert']

# Plot mit Plotly
fig = px.bar(
    df_melted,
    x="Metrik",
    y="Wert",
    color="Modell",
    barmode="group",
    text_auto='.2f',
    title="Modellvergleich"
)

fig.update_layout(
    yaxis=dict(range=[0, 1]),
    xaxis_title="Metrik",
    yaxis_title="Score",
    legend_title="Modell",
    title_x=0.5
)

fig.show()

# Modellnamen definieren (Baseline + Kandidaten)
model_names = ["LogReg (VIF reduced)", "Random Forest", "Balanced RF", "XGBoost"]

# Dictionary zur Speicherung der trainierten Modelle
trained_models = {}

# Alle Modelle aus dem Pipeline-Dictionary auf Trainingsdaten fitten
for name, pipe in pipelines.items():
    print(f"Fitting Modell: {name}")
    pipe.fit(X_train, y_train)
    trained_models[name] = pipe

# VIF-reduziertes LogReg-Modell separat fitten
print("Fitting LogReg (VIF reduced)")
baseline_pipeline_reduced.fit(X_train[num_reduced + cat_cols], y_train)
trained_models["LogReg (VIF reduced)"] = baseline_pipeline_reduced

# Analyse der Überlappung der Top-N Kunden (z.B. Top 5% und 10%)
top_percentages = [0.05, 0.10]  # 5% und 10%
overlap_results = []

for p in top_percentages:
    n_top = int(len(X_test) * p)
    print(f"\nTop {int(p*100)}% Kunden: jeweils {n_top} Kunden")

    top_customers = {}
    for name, model in trained_models.items():
        # Für LogReg reduzierte Features verwenden, sonst alle Features
        if name == "LogReg (VIF reduced)":
            proba = model.predict_proba(X_test[num_reduced + cat_cols])[:, 1]
        else:
            proba = model.predict_proba(X_test)[:, 1]

        top_idx = np.argsort(proba)[-n_top:]
        top_customers[name] = set(X_test.iloc[top_idx].index)

    # Jaccard-Index und Schnittmenge für alle Modellpaare berechnen
    for i, name1 in enumerate(model_names):
        for j, name2 in enumerate(model_names):
            if j <= i:
                continue
            set1 = top_customers[name1]
            set2 = top_customers[name2]
            intersection = len(set1.intersection(set2))
            union = len(set1.union(set2))
            jaccard = intersection / union if union > 0 else 0
            overlap_results.append({
                "Top (%)": int(p*100),
                "Modell 1": name1,
                "Modell 2": name2,
                "Intersection": intersection,
                "Jaccard": jaccard
            })

# DataFrame mit den Überlappungs-Metriken erstellen
overlap_df = pd.DataFrame(overlap_results)

# Funktion zur Hervorhebung des Jaccard-Werts mit Farben
def highlight_jaccard(val):
    if val >= 0.8:
        return 'background-color: #2ecc71; color: white;'  # Grün: sehr hoch
    elif val >= 0.6:
        return 'background-color: #27ae60; color: white;'  # Dunkelgrün
    elif val >= 0.4:
        return 'background-color: #f1c40f; color: black;'  # Gelb: mittel
    elif val >= 0.2:
        return 'background-color: #e67e22; color: black;'  # Orange: niedrig
    else:
        return 'background-color: #e74c3c; color: white;'  # Rot: sehr niedrig

# Tabelle mit absoluten Top-N Kunden pro Prozentwert erstellen
n_total = len(X_test)
cutoff_info = pd.DataFrame({
    "Top (%)": [int(p*100) for p in top_percentages],
    "Top N"  : [int(n_total*p) for p in top_percentages]
})
print(cutoff_info)

# Gestylte Tabelle mit farblicher Hervorhebung der Jaccard-Index-Werte anzeigen
styled_df = overlap_df.style.format({"Jaccard": "{:.2f}"}).applymap(highlight_jaccard, subset=['Jaccard'])
display(styled_df)

Fitting Modell: Random Forest
Fitting Modell: Balanced RF
Fitting Modell: XGBoost
Fitting Modell: HistGradBoost
Fitting LogReg (VIF reduced)

Top 5% Kunden: jeweils 37 Kunden

Top 10% Kunden: jeweils 75 Kunden
   Top (%)  Top N
0        5     37
1       10     75

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Mehrere Metriken für RandomizedSearchCV definieren
scoring = {
    "f1": "f1",            # Optimiert nach F1-Score
    "roc_auc": "roc_auc",  # Auch ROC-AUC als wichtige Metrik
    "accuracy": "accuracy",
    "precision": "precision",
    "recall": "recall"
}

# 1) Parameterraum für BalancedRandomForestClassifier
param_dist = {
    "clf__n_estimators"     : [25, 50, 100, 200, 300, 400, 500],
    "clf__max_depth"        : [None, 5, 10, 15, 20],
    "clf__min_samples_split": randint(2, 11),
    "clf__min_samples_leaf" : randint(1, 11),
    "clf__max_features"     : ["sqrt", "log2", None, 0.3, 0.5, 0.7],
    "clf__bootstrap"        : [True, False],
    "clf__criterion"        : ["gini", "entropy"],
    "clf__sampling_strategy": ["auto", 0.5, 0.75],
    "clf__replacement"      : [True, False],
}

# 2) Pipeline mit Preprocessing + BalancedRandomForestClassifier
brf_pipe = Pipeline([
    ("prep", preprocess),
    ("clf", BalancedRandomForestClassifier(random_state=42, n_jobs=1))
])

# 3) RandomizedSearchCV konfigurieren und fitten
brf_search = RandomizedSearchCV(
    estimator=brf_pipe,
    param_distributions=param_dist,
    n_iter=100,               # Anzahl zufälliger Parameterkombinationen
    scoring=scoring,          # Mehrere Metriken zur Evaluierung
    refit="roc_auc",              # Nach F1 Score wird das beste Modell ausgewählt
    cv=cv,
    verbose=1,
    n_jobs=-1,
    random_state=42
)

brf_search.fit(X_train, y_train)

# 4) Bestes Modell aus RandomizedSearch extrahieren
best_brf = brf_search.best_estimator_

# 5) Evaluation auf Trainingsdaten
y_train_pred = best_brf.predict(X_train)
y_train_proba = best_brf.predict_proba(X_train)[:, 1]

print("\n=== Klassifikationsbericht (Trainings-Set) ===")
print(classification_report(y_train, y_train_pred, digits=3))
print(f"ROC-AUC (Trainings-Set): {roc_auc_score(y_train, y_train_proba):.3f}")

cm_train = confusion_matrix(y_train, y_train_pred, normalize="true")
plt.figure(figsize=(5, 4))
sns.heatmap(cm_train, annot=True, fmt=".2f", cmap="viridis")
plt.title("Confusion Matrix (Trainings-Set)")
plt.tight_layout()
plt.show()

# 6) Evaluation auf Testdaten
y_test_pred = best_brf.predict(X_test)
y_test_proba = best_brf.predict_proba(X_test)[:, 1]

print("\n=== Klassifikationsbericht (Test-Set) ===")
print(classification_report(y_test, y_test_pred, digits=3))
print(f"ROC-AUC (Test-Set): {roc_auc_score(y_test, y_test_proba):.3f}")

cm_test = confusion_matrix(y_test, y_test_pred, normalize="true")
plt.figure(figsize=(5, 4))
sns.heatmap(cm_test, annot=True, fmt=".2f", cmap="viridis")
plt.title("Confusion Matrix (Test-Set)")
plt.tight_layout()
plt.show()

# 7) Threshold-Tuning: Precision, Recall und F1-Score für verschiedene Entscheidungsschwellen


thresholds = np.linspace(0.1, 0.9, 17)  # Schwellen von 0.1 bis 0.9 in 0.05-Schritten
metrics = []

for t in thresholds:
    y_thresh = (y_test_proba >= t).astype(int)
    precision = precision_score(y_test, y_thresh)
    recall = recall_score(y_test, y_thresh)
    f1 = f1_score(y_test, y_thresh)
    metrics.append((t, precision, recall, f1))

df_thresh = pd.DataFrame(metrics, columns=["threshold", "precision", "recall", "f1"])
display(df_thresh)

df_thresh.set_index("threshold").plot(figsize=(8, 5), marker="o")
plt.title("Precision, Recall und F1-Score in Abhängigkeit vom Threshold")
plt.ylabel("Score")
plt.xlabel("Entscheidungsschwelle")
plt.grid(True)
plt.tight_layout()
plt.show()

# 8) Ausgabe der besten Parameterkombination
print("\nBeste Parameterkombination:")
print(brf_search.best_params_)

Fitting 5 folds for each of 100 candidates, totalling 500 fits

=== Klassifikationsbericht (Trainings-Set) ===
              precision    recall  f1-score   support

           0      1.000     0.783     0.878      2475
           1      0.507     1.000     0.673       553

    accuracy                          0.823      3028
   macro avg      0.754     0.892     0.776      3028
weighted avg      0.910     0.823     0.841      3028

ROC-AUC (Trainings-Set): 1.000

=== Klassifikationsbericht (Test-Set) ===
              precision    recall  f1-score   support

           0      0.978     0.735     0.839       619
           1      0.438     0.928     0.595       138

    accuracy                          0.770       757
   macro avg      0.708     0.831     0.717       757
weighted avg      0.880     0.770     0.795       757

ROC-AUC (Test-Set): 0.886

Beste Parameterkombination:
{'clf__bootstrap': False, 'clf__criterion': 'entropy', 'clf__max_depth': None, 'clf__max_features': 0.7, 'clf__min_samples_leaf': 3, 'clf__min_samples_split': 2, 'clf__n_estimators': 200, 'clf__replacement': False, 'clf__sampling_strategy': 'auto'}

# Berechnung der ROC-Kurve und Schwellenwerte
y_proba = best_brf.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_proba)
roc_auc = auc(fpr, tpr)
 
plt.figure(figsize=(12, 8))
plt.plot(fpr, tpr, label=f'ROC Curve (AUC={roc_auc:.2f})', color='blue', linewidth=2)
 
# Anzahl der Schwellenwerte, die du anzeigen möchtest
num_thresholds = 7
# Indizes gleichmässig auswählen
indices = np.linspace(0, len(thresholds) - 1, num=num_thresholds, dtype=int)
 
for i in indices:
    # Punkt markieren
    plt.scatter(fpr[i], tpr[i], color='red', s=70, zorder=5)
    # Text etwas oberhalb und rechts vom Punkt
    plt.text(fpr[i] + 0.02, tpr[i] - 0.03, f'{thresholds[i]:.2f}',
             fontsize=12, color='red', fontweight='bold', zorder=6)
 
plt.xlabel('False Positive Rate', fontsize=14)
plt.ylabel('True Positive Rate', fontsize=14)
plt.title("ROC-Kurve (Test-Set) mit Schwellenwerten", fontsize=16)
 
plt.legend(loc="lower right", fontsize=12)
plt.grid(True, linestyle='--', alpha=0.7)
plt.tight_layout()
plt.show()

# Bestes Modell aus RandomizedSearchCV auswählen
best_model = brf_search.best_estimator_

# Permutation Feature Importance (PFI) berechnen:
# Messen, wie stark die Modellleistung (ROC-AUC) bei zufälligem Permutieren eines Features sinkt
result = permutation_importance(
    best_model,
    X_train,
    y_train,
    scoring='roc_auc',
    n_repeats=10,         # Anzahl Wiederholungen zur Stabilität
    random_state=42,
    n_jobs=1
)

# Feature-Namen aus Trainingsdaten extrahieren
feature_names = X_train.columns

# Indizes der Features nach absteigender mittlerer Wichtigkeit sortieren
sorted_idx = result.importances_mean.argsort()[::-1]

top_n = 10
top_idx = sorted_idx[:top_n]

# Feature-Namen bereinigen für bessere Lesbarkeit (Unterstriche durch Leerzeichen ersetzen, erste Buchstaben gross)
feature_names_clean = [name.replace('_', ' ').title() for name in feature_names]

# Horizontaler Balkendiagrammplot der Top-N Features mit mittlerer Wichtigkeit und Standardabweichung
plt.figure(figsize=(10, 6))
plt.barh(
    np.array(feature_names_clean)[top_idx],
    result.importances_mean[top_idx],
    xerr=result.importances_std[top_idx],
    color='salmon',
    align='center'
)
plt.xlabel("Mean decrease in ROC-AUC after permutation")
plt.title(f"Permutation Feature Importance (Top {top_n} Features)")
plt.gca().invert_yaxis()  # Wichtigste Features oben anzeigen
plt.tight_layout()
plt.show()

# Originalnamen der Top-n Features extrahieren
top_features = [feature_names[i] for i in top_idx]

# Layout mit 3x4 Subplots für bis zu 12 Features
fig, axs = plt.subplots(3, 4, figsize=(18, 12))
axs = axs.flatten()

# Partial Dependence Plots (PDP) für die Top-Features erzeugen
PartialDependenceDisplay.from_estimator(best_brf, X_train, top_features, ax=axs[:len(top_features)])

for ax in axs[:len(top_features)]:
    # X-Achse mit Tausender-Trennung formatieren
    ax.xaxis.set_major_formatter(mtick.FuncFormatter(lambda x, _: f'{int(x):,}'))
    ax.grid(True, linestyle='--', alpha=0.4)
    ax.set_xlabel(ax.get_xlabel(), fontsize=10, fontweight='bold')
    ax.set_ylabel('Partial dependence', fontsize=10, fontweight='bold')
    ax.tick_params(axis='both', which='major', labelsize=9)

# Überflüssige Achsen entfernen, falls weniger Features als Subplots
for ax in axs[len(top_features):]:
    fig.delaxes(ax)

plt.suptitle("Partial Dependence Plots (PDP) für Top 10 Features", fontsize=16, fontweight='bold')
plt.tight_layout(rect=[0, 0, 1, 0.95])
plt.show()

import plotly.graph_objects as go
from plotly.subplots import make_subplots
from sklearn.inspection import permutation_importance

# Optimiertes Modell in pipelines einfügen 
pipelines["Balanced RF (Optimiert)"] = brf_search.best_estimator_

# Baseline Logistic VIF hinzufügen
pipelines["Baseline Logistic Regression (VIF Reduced)"] = baseline_pipeline_reduced

# Auswahl der Modelle für den Plot
pipelines_subset = {
    "Balanced RF (Optimiert)": pipelines["Balanced RF (Optimiert)"],
    "Random Forest": pipelines["Random Forest"],
    "HistGradBoost": pipelines["HistGradBoost"],
    "XGBoost": pipelines["XGBoost"],
    "Baseline Logistic Regression (VIF Reduced)": pipelines["Baseline Logistic Regression (VIF Reduced)"]
}

# Farben für Modelle (Plotly Palette)
colors = ["#1f77b4", "#2ca02c", "#ff7f0e", "#d62728", "#9467bd"]

top_n = 10
pfi_results = {}

# Modelle trainieren und PFI berechnen
for name, pipe in pipelines_subset.items():
    print(f"Trainiere {name} und berechne PFI...")
    pipe.fit(X_train, y_train)
    result = permutation_importance(
        pipe, X_train, y_train,
        scoring='roc_auc', n_repeats=10,
        random_state=42, n_jobs=1
    )
    pfi_results[name] = result

# Max-Wert zur Skalierung finden
max_global_importance = max([res.importances_mean.max() for res in pfi_results.values()])

# Layout vorbereiten
num_models = len(pfi_results)
cols = 2
rows = (num_models + 1) // cols

fig = make_subplots(
    rows=rows,
    cols=cols,
    subplot_titles=list(pfi_results.keys()),
    horizontal_spacing=0.12,
    vertical_spacing=0.15
)

# Balken hinzufügen
for i, (name, result) in enumerate(pfi_results.items()):
    try:
        cat_pipeline = pipelines_subset[name].named_steps['prep'].named_transformers_['cat']
        ohe = cat_pipeline.named_steps['encode'] if hasattr(cat_pipeline, "named_steps") else cat_pipeline
        ohe_features = list(ohe.get_feature_names_out(cat_cols))
    except Exception as e:
        print(f"Konnte OHE für {name} nicht extrahieren: {e}")
        ohe_features = []

    base_features = list(num_reduced if "VIF" in name else num_cols)
    feature_names = base_features + ohe_features
    feature_names_clean = [f.replace('_', ' ').title() for f in feature_names]

    sorted_idx = result.importances_mean.argsort()[::-1]
    top_idx = sorted_idx[:top_n]

    x = result.importances_mean[top_idx][::-1]
    error_x = result.importances_std[top_idx][::-1]

    y = np.array([
        name.replace(" Client", "").replace("Transaction ", "Trans. ").replace(" Amount", "")
        if len(name) < 30 else name[:28] + "…"
        for name in np.array(feature_names_clean)[top_idx][::-1]
    ])

    row = i // cols + 1
    col = i % cols + 1

    fig.add_trace(
        go.Bar(
            x=x,
            y=y,
            error_x=dict(type='data', array=error_x),
            orientation='h',
            marker_color=colors[i % len(colors)],
            text=[f"{val:.3f}" for val in x],
            textposition="auto",
            hovertemplate="%{y}: %{x:.3f} ± %{error_x.array:.3f}",
            showlegend=False
        ),
        row=row,
        col=col
    )

    fig.update_xaxes(title_text="Permutation Importance", range=[0, max_global_importance], row=row, col=col)
    fig.update_yaxes(tickfont=dict(size=9), row=row, col=col)

# Layout
fig.update_layout(
    height=300 * rows,
    width=950,
    title_text="Top 10 Feature Importance (Permutation) – Ausgewählte Modelle",
    title_font=dict(size=16),
    margin=dict(t=80, l=40, r=20, b=40)
)

fig.show()

Trainiere Balanced RF (Optimiert) und berechne PFI...
Trainiere Random Forest und berechne PFI...
Trainiere HistGradBoost und berechne PFI...
Trainiere XGBoost und berechne PFI...
Trainiere Baseline Logistic Regression (VIF Reduced) und berechne PFI...

# 1. Auswahl der wichtigsten Features für das reduzierte Modell
selected_features = ['avg_balance', 'balance_before_cc', 'avg_trans_amount', 'max_balance', 'max_trans_amount']

# 2. Trainings- und Testdaten auf die ausgewählten Features reduzieren
X_train_reduced = X_train[selected_features]
X_test_reduced = X_test[selected_features]

# 3. Neues BalancedRandomForest-Modell m erstellen
model_reduced = BalancedRandomForestClassifier(
    n_estimators=400,
    random_state=42,
    class_weight='balanced'
)

# 4. Modelltraining auf den reduzierten Daten
model_reduced.fit(X_train_reduced, y_train)

# 5. Vorhersagen und Wahrscheinlichkeiten für Referenz- und reduziertes Modell berechnen
y_pred_ref = best_brf.predict(X_test)
y_proba_ref = best_brf.predict_proba(X_test)[:, 1]
y_pred_reduced = model_reduced.predict(X_test_reduced)
y_proba_reduced = model_reduced.predict_proba(X_test_reduced)[:, 1]

# ROC-AUC für beide Modelle berechnen
roc_ref = roc_auc_score(y_test, y_proba_ref)
roc_reduced = roc_auc_score(y_test, y_proba_reduced)

# Weitere Klassifikationsmetriken berechnen
metrics = {
    "ROC-AUC": [roc_ref, roc_reduced],
    "Accuracy": [accuracy_score(y_test, y_pred_ref), accuracy_score(y_test, y_pred_reduced)],
    "Precision": [precision_score(y_test, y_pred_ref), precision_score(y_test, y_pred_reduced)],
    "Recall": [recall_score(y_test, y_pred_ref), recall_score(y_test, y_pred_reduced)],
    "F1-Score": [f1_score(y_test, y_pred_ref), f1_score(y_test, y_pred_reduced)]
}

# DataFrame für den Vergleich der Metriken erstellen
df_comparison = pd.DataFrame(metrics, index=["Referenzmodell", "Reduziertes Modell"])
print(df_comparison)

# Werte für Balkendiagramm vorbereiten
labels = df_comparison.columns.tolist()
referenz = df_comparison.loc["Referenzmodell"].values
reduziert = df_comparison.loc["Reduziertes Modell"].values

# Balkendiagramm erstellen zum Vergleich der Modell-Performances
x = np.arange(len(labels))
width = 0.35

fig, ax = plt.subplots(figsize=(10, 6))
rects1 = ax.bar(x - width/2, referenz, width, label='Referenzmodell')
rects2 = ax.bar(x + width/2, reduziert, width, label='Reduziertes Modell')

# Achsentitel und Diagrammtitel setzen
ax.set_ylabel('Score')
ax.set_title('Modellvergleich der Performance-Metriken')
ax.set_xticks(x)
ax.set_xticklabels(labels)
ax.legend()

# Werte über den Balken anzeigen
def autolabel(rects):
    for rect in rects:
        height = rect.get_height()
        ax.annotate(f'{height:.3f}',
                    xy=(rect.get_x() + rect.get_width() / 2, height),
                    xytext=(0, 3),  # vertikale Verschiebung
                    textcoords="offset points",
                    ha='center', va='bottom')

autolabel(rects1)
autolabel(rects2)

# Y-Achse begrenzen
plt.ylim(0, 1.1)

plt.show()

                    ROC-AUC  Accuracy  Precision  Recall  F1-Score
Referenzmodell         0.89      0.77       0.44    0.93      0.60
Reduziertes Modell     0.88      0.79       0.45    0.76      0.56

import plotly.graph_objects as go
from plotly.subplots import make_subplots

def plot_lift_and_response_separate_models(y_true, y_scores_dict, segments=10):
    n_models = len(y_scores_dict)

    # Kürzere Modellnamen
    def shorten(name):
        if "Logistic" in name:
            return "Baseline LogReg"
        elif "Balanced RF" in name:
            return "Balanced RF"
        return name[:20] + "..."

    subplot_titles = []
    for name in y_scores_dict.keys():
        short = shorten(name)
        subplot_titles.extend([f"{short} Lift", f"{short} Response"])

    fig = make_subplots(
        rows=n_models, cols=2,
        subplot_titles=subplot_titles,
        vertical_spacing=0.25,
        horizontal_spacing=0.08
    )

    for i, (model_name, y_scores) in enumerate(y_scores_dict.items(), start=1):
        sorted_data = sorted(zip(y_true, y_scores), key=lambda x: x[1], reverse=True)
        y_sorted = np.array([y for y, _ in sorted_data])
        n = len(y_sorted)
        total_pos = y_sorted.sum()

        bin_size = n // segments
        lift_vals = []
        response_vals = []
        segment_centers = []

        for j in range(segments):
            start = j * bin_size
            end = (j + 1) * bin_size if j < segments - 1 else n
            segment = y_sorted[start:end]
            expected = len(segment) * (total_pos / n)
            lift_vals.append(segment.sum() / expected if expected > 0 else 0)
            response_vals.append(segment.mean() if len(segment) > 0 else 0)
            segment_centers.append((start + end) / 2 / n * 100)

        # Lift-Balken
        fig.add_trace(go.Bar(
            x=segment_centers,
            y=lift_vals,
            name=f"{shorten(model_name)} Lift",
            showlegend=(i == 1),
            marker_color='blue'
        ), row=i, col=1)
        fig.update_yaxes(range=[0, 4.5], title_text="Lift", row=i, col=1)

        # Response-Kurve
        fig.add_trace(go.Scatter(
            x=segment_centers,
            y=response_vals,
            name=f"{shorten(model_name)} Response",
            mode='lines+markers',
            marker=dict(color='orange'),
            showlegend=(i == 1)
        ), row=i, col=2)
        fig.update_yaxes(range=[0, 0.7], title_text="Response", row=i, col=2)

        # Zufallslinie (nur einmal oben)
        if i == 1:
            fig.add_trace(go.Scatter(
                x=[0, 100], y=[1, 1],
                mode='lines',
                line=dict(dash='dash', color='gray'),
                name="Zufall (Lift=1)",
                showlegend=True
            ), row=i, col=1)

        # X-Achsen: Ticks & Titel
        for col in [1, 2]:
            fig.update_xaxes(
                title_text="Top-Kundenanteil (%)",
                tickmode="linear",
                tick0=0,
                dtick=10,
                row=i,
                col=col
            )

    # Gemeinsames Layout
    fig.update_layout(
        height=320 * n_models,
        width=1000,
        title_text="Segmentierter Lift & Response pro Modell",
        legend=dict(
            x=1.02,
            y=1,
            orientation='v',
            font=dict(size=11),
            bgcolor='rgba(255,255,255,0)'
        ),
        margin=dict(t=100, r=180, l=50, b=60),
        plot_bgcolor='white',
        font=dict(size=13)
    )

    fig.update_xaxes(showgrid=True, gridcolor='lightgray', gridwidth=1, title_standoff=10)
    fig.update_yaxes(showgrid=True, gridcolor='lightgray', gridwidth=1, title_standoff=10)
    fig.show()


def plot_cumulative_lift(y_true, y_scores_dict):
    """
    Plottet kumulative Lift-Kurve für mehrere Modelle.
    """
    fig = go.Figure()

    for model_name, y_scores in y_scores_dict.items():
        sorted_data = sorted(zip(y_true, y_scores), key=lambda x: x[1], reverse=True)
        y_sorted = np.array([y for y, _ in sorted_data])
        total_pos = y_sorted.sum()
        cum_true_positives = np.cumsum(y_sorted)
        lift_cumulative = cum_true_positives / total_pos
        percents = np.arange(1, len(y_sorted) + 1) / len(y_sorted)

        fig.add_trace(go.Scatter(
            x=percents,
            y=lift_cumulative,
            mode='lines',
            name=model_name
        ))

    # Zufallslinie
    fig.add_trace(go.Scatter(
        x=percents,
        y=percents,
        mode='lines',
        name='Zufall',
        line=dict(dash='dash', color='gray')
    ))

    fig.update_layout(
        title="Kumulative Lift-Kurve – Modellvergleich",
        xaxis_title="Top-N Kunden (nach Score) [%]",
        yaxis_title="Kumulierter Anteil Käufer",
        xaxis=dict(tickmode="linear", tick0=0, dtick=0.1,
                   showgrid=True, gridcolor='lightgray'),
        yaxis=dict(showgrid=True, gridcolor='lightgray'),
        plot_bgcolor='white',
        font=dict(size=14),
        width=1000,
        height=450,
        margin=dict(t=80, l=60, r=60, b=60)
    )
    fig.show()



# Wahrscheinlichkeiten berechnen (Beispiel)
y_proba_baseline = pipelines["Baseline Logistic Regression (VIF Reduced)"].predict_proba(X_test)[:, 1]
y_proba_final = best_brf.predict_proba(X_test)[:, 1]

# Segmentierte Lift- & Response-Kurven plotten
plot_lift_and_response_separate_models(
    y_test,
    {
        "Baseline Logistic Regression (VIF Reduced)": y_proba_baseline,
        "Finales Modell (Balanced RF)": y_proba_final
    },
    segments=20
)

# Kumulative Lift-Kurve plotten
plot_cumulative_lift(
    y_test,
    {
        "Baseline Logistic Regression (VIF Reduced)": y_proba_baseline,
        "Finales Modell (Balanced RF)": y_proba_final
    }
)

	card_id	disp_id	type	issued
0	1005	9285	classic	1993-11-07
1	104	588	classic	1994-01-19
2	747	4915	classic	1994-02-05
3	70	439	classic	1994-02-08
4	577	3687	classic	1994-02-15

	card_id	issued
0	1005	1993-11-07
1	104	1994-01-19
2	747	1994-02-05
3	70	1994-02-08
4	577	1994-02-15
5	377	1994-03-03
6	721	1994-04-05
7	437	1994-06-01
8	188	1994-06-19
9	13	1994-06-29

	account_id	date
0	576	1993-01-01
1	3818	1993-01-01
2	704	1993-01-01
3	2378	1993-01-01
4	2632	1993-01-02
5	1972	1993-01-02
10	1726	1993-01-03
9	1695	1993-01-03
8	2484	1993-01-03
7	793	1993-01-03

	A1	A2	A3	A4	A5	A6	A7	A8	A9	A10	A11	A12	A13	A14	A15	A16
0	1	Hl.m. Praha	Prague	1204953	0	0	0	1	1	100.00	12541	0.29	0.43	167	85677	99107
1	2	Benesov	central Bohemia	88884	80	26	6	2	5	46.70	8507	1.67	1.85	132	2159	2674
2	3	Beroun	central Bohemia	75232	55	26	4	1	5	41.70	8980	1.95	2.21	111	2824	2813
3	4	Kladno	central Bohemia	149893	63	29	6	2	6	67.40	9753	4.64	5.05	109	5244	5892
4	5	Kolin	central Bohemia	95616	65	30	4	1	6	51.40	9307	3.85	4.43	118	2616	3040

	order_id	account_id	bank_to	account_to	amount	k_symbol
0	29401	1	YZ	87144583	2452.00	SIPO
1	29402	2	ST	89597016	3372.70	UVER
2	29403	2	QR	13943797	7266.00	SIPO
3	29404	3	WX	83084338	1135.00	SIPO
4	29405	3	CD	24485939	327.00

	card_id	disp_id	type	issued
count	892.00	892.00	892	892
unique	NaN	NaN	3	NaN
top	NaN	NaN	classic	NaN
freq	NaN	NaN	659	NaN
mean	480.86	3511.86	NaN	1997-09-19 05:19:38.475336320
min	1.00	9.00	NaN	1993-11-07 00:00:00
25%	229.75	1387.00	NaN	1997-01-25 12:00:00
50%	456.50	2938.50	NaN	1998-01-06 00:00:00
75%	684.25	4459.50	NaN	1998-08-05 06:00:00
max	1247.00	13660.00	NaN	1998-12-29 00:00:00
std	306.93	2984.37	NaN	NaN

	disp_id	client_id	account_id	type
count	5369.00	5369.00	5369.00	5369
unique	NaN	NaN	NaN	2
top	NaN	NaN	NaN	OWNER
freq	NaN	NaN	NaN	4500
mean	3337.10	3359.01	2767.50	NaN
std	2770.42	2832.91	2307.84	NaN
min	1.00	1.00	1.00	NaN
25%	1418.00	1418.00	1178.00	NaN
50%	2839.00	2839.00	2349.00	NaN
75%	4257.00	4257.00	3526.00	NaN
max	13690.00	13998.00	11382.00	NaN

	account_id	district_id	frequency	date
0	576	55	POPLATEK MESICNE	1993-01-01
1	3818	74	POPLATEK MESICNE	1993-01-01
2	704	55	POPLATEK MESICNE	1993-01-01
3	2378	16	POPLATEK MESICNE	1993-01-01
4	2632	24	POPLATEK MESICNE	1993-01-02

	account_id	district_id	frequency	date
count	4500.00	4500.00	4500	4500
unique	NaN	NaN	3	NaN
top	NaN	NaN	monthly_issuance	NaN
freq	NaN	NaN	4167	NaN
mean	2786.07	37.31	NaN	1995-08-08 01:38:52.800000
min	1.00	1.00	NaN	1993-01-01 00:00:00
25%	1182.75	13.00	NaN	1993-12-27 00:00:00
50%	2368.00	38.00	NaN	1996-01-02 00:00:00
75%	3552.25	60.00	NaN	1996-11-01 00:00:00
max	11382.00	77.00	NaN	1997-12-29 00:00:00
std	2313.81	25.18	NaN	NaN

	order_id	account_id	bank_to	account_to	amount	k_symbol
count	6471.00	6471.00	6471	6471	6471.00	5092
unique	NaN	NaN	13	6446	NaN	4
top	NaN	NaN	QR	29934013	NaN	household_payment
freq	NaN	NaN	531	2	NaN	3502
mean	33778.20	2962.30	NaN	NaN	3280.64	NaN
std	3737.68	2518.50	NaN	NaN	2714.48	NaN
min	29401.00	1.00	NaN	NaN	1.00	NaN
25%	31187.50	1223.00	NaN	NaN	1241.50	NaN
50%	32988.00	2433.00	NaN	NaN	2596.00	NaN
75%	34785.50	3645.50	NaN	NaN	4613.50	NaN
max	46338.00	11362.00	NaN	NaN	14882.00	NaN

	loan_id	account_id	date	amount	duration	payments	status
0	5314	1787	1993-07-05	96396.00	12	8033.00	B
1	5316	1801	1993-07-11	165960.00	36	4610.00	A
2	6863	9188	1993-07-28	127080.00	60	2118.00	A
3	5325	1843	1993-08-03	105804.00	36	2939.00	A
4	7240	11013	1993-09-06	274740.00	60	4579.00	A

	loan_id	account_id	date	amount	duration	payments	status
count	682.00	682.00	682	682.00	682.00	682.00	682
unique	NaN	NaN	NaN	NaN	NaN	NaN	4
top	NaN	NaN	NaN	NaN	NaN	NaN	running_ok
freq	NaN	NaN	NaN	NaN	NaN	NaN	403
mean	6172.47	5824.16	1996-09-29 05:35:43.108504448	151410.18	36.49	4190.66	NaN
min	4959.00	2.00	1993-07-05 00:00:00	4980.00	12.00	304.00	NaN
25%	5577.50	2967.00	1995-07-04 12:00:00	66732.00	24.00	2477.00	NaN
50%	6176.50	5738.50	1997-02-06 12:00:00	116928.00	36.00	3934.00	NaN
75%	6752.50	8686.00	1997-12-12 12:00:00	210654.00	48.00	5813.50	NaN
max	7308.00	11362.00	1998-12-08 00:00:00	590820.00	60.00	9910.00	NaN
std	682.58	3283.51	NaN	113372.41	17.08	2215.83	NaN

	trans_id	account_id	date	type	operation	amount	balance	k_symbol	bank	account
0	695247	2378	1993-01-01	PRIJEM	VKLAD	700.00	700.00	NaN	<NA>	<NA>
1	171812	576	1993-01-01	PRIJEM	VKLAD	900.00	900.00	NaN	<NA>	<NA>
2	207264	704	1993-01-01	PRIJEM	VKLAD	1000.00	1000.00	NaN	<NA>	<NA>
3	1117247	3818	1993-01-01	PRIJEM	VKLAD	600.00	600.00	NaN	<NA>	<NA>
4	579373	1972	1993-01-02	PRIJEM	VKLAD	400.00	400.00	NaN	<NA>	<NA>

	trans_id	account_id	date	type	operation	amount	balance	k_symbol
count	1056320.00	1056320.00	1056320	1056320	1056320	1056320.00	1056320.00	1056320
unique	NaN	NaN	NaN	2	6	NaN	NaN	8
top	NaN	NaN	NaN	withdrawal	withdrawal_in_cash	NaN	NaN	unknown
freq	NaN	NaN	NaN	651237	434918	NaN	NaN	535314
mean	1335310.70	2936.87	1997-01-04 07:29:27.037261952	NaN	NaN	186.64	38518.33	NaN
min	1.00	1.00	1993-01-01 00:00:00	NaN	NaN	-87400.00	-41125.70	NaN
25%	430262.75	1204.00	1996-01-16 00:00:00	NaN	NaN	-3019.00	22402.50	NaN
50%	858506.50	2434.00	1997-04-10 00:00:00	NaN	NaN	-14.60	33143.40	NaN
75%	2060979.25	3660.00	1998-02-28 00:00:00	NaN	NaN	200.00	49603.62	NaN
max	3682987.00	11382.00	1998-12-31 00:00:00	NaN	NaN	74812.00	209637.00	NaN
std	1227486.51	2477.35	NaN	NaN	NaN	11213.53	22117.87	NaN

	client_id	birth_number	district_id_x	gender	age	disp_id	account_id	type_x	card_id	type_y	issued	district_id_y	frequency	date
0	1	1970-12-13	18	F	28	1	1	OWNER	NaN	NaN	NaT	18	monthly_issuance	1995-03-24
1	2	1945-02-04	1	M	53	2	2	OWNER	NaN	NaN	NaT	1	monthly_issuance	1993-02-26
2	3	1940-10-09	1	F	58	3	2	DISPONENT	NaN	NaN	NaT	1	monthly_issuance	1993-02-26
3	4	1956-12-01	5	M	42	4	3	OWNER	NaN	NaN	NaT	5	monthly_issuance	1997-07-07
4	5	1960-07-03	5	F	38	5	3	DISPONENT	NaN	NaN	NaT	5	monthly_issuance	1997-07-07
5	6	1919-09-22	12	M	79	6	4	OWNER	NaN	NaN	NaT	12	monthly_issuance	1996-02-21
6	7	1929-01-25	15	M	69	7	5	OWNER	NaN	NaN	NaT	15	monthly_issuance	1997-05-30
7	8	1938-02-21	51	F	60	8	6	OWNER	NaN	NaN	NaT	51	monthly_issuance	1994-09-27
8	9	1935-10-16	60	M	63	9	7	OWNER	1.00	gold	1998-10-16	60	monthly_issuance	1996-11-24
9	10	1943-05-01	57	M	55	10	8	OWNER	NaN	NaN	NaT	57	monthly_issuance	1995-09-21

	client_id	birth_number	district_id_client	gender	age	disp_id	account_id	disposition_type	card_id	credit_card_type	...	n_municipals_between_2000_9999	n_municipals_higher_10000	n_cities	ratio_urban_inhabitants	avg_salary	unemployment_rate_1995	unemployment_rate_1996	n_enterpreneurs_per_1k_inhabitants	n_crimes_1995	n_crimes_1996
8	9	1935-10-16	60	M	63	9	7	OWNER	1.00	gold	...	4	1	4	51.90	8441	3.45	4.48	115	1879.00	2252
18	19	1942-12-28	47	M	56	19	14	OWNER	2.00	classic	...	5	1	6	72.80	9538	1.51	1.81	111	6079.00	5410
38	41	1968-08-27	22	M	30	41	33	OWNER	3.00	gold	...	4	1	6	52.40	8620	1.10	1.25	100	1089.00	1117
39	42	1935-08-17	68	M	63	42	34	OWNER	4.00	classic	...	18	2	6	57.20	9893	4.09	4.72	96	5623.00	5887
48	51	1979-12-02	36	F	19	51	43	OWNER	5.00	junior	...	8	1	9	85.20	9198	3.33	4.28	131	5796.00	6132
52	56	1960-03-31	21	M	38	56	48	OWNER	7.00	classic	...	7	1	7	67.00	9104	1.51	2.07	123	2299.00	2354
56	60	1980-02-19	67	M	18	60	51	OWNER	8.00	junior	...	6	2	6	63.10	8110	5.77	6.55	109	3244.00	3079
71	76	1967-10-01	36	F	31	76	65	OWNER	9.00	classic	...	8	1	9	85.20	9198	3.33	4.28	131	5796.00	6132
72	77	1956-02-18	1	F	42	77	66	OWNER	10.00	classic	...	0	1	1	100.00	12541	0.29	0.43	167	85677.00	99107
74	79	1969-03-10	37	F	29	79	68	OWNER	11.00	gold	...	6	2	9	62.30	9065	4.46	5.39	123	4147.00	4166

	client_id	birth_number	district_id_client	gender	age	disp_id	account_id	disposition_type	card_id	credit_card_type	...	n_cities	ratio_urban_inhabitants	avg_salary	unemployment_rate_1995	unemployment_rate_1996	n_enterpreneurs_per_1k_inhabitants	n_crimes_1995	n_crimes_1996	has_cc	cc_purchase_date
8	9	1935-10-16	60	M	63	9	7	OWNER	1.00	gold	...	4	51.90	8441	3.45	4.48	115	1879.00	2252	1	1998-10-16
18	19	1942-12-28	47	M	56	19	14	OWNER	2.00	classic	...	6	72.80	9538	1.51	1.81	111	6079.00	5410	1	1998-03-13
38	41	1968-08-27	22	M	30	41	33	OWNER	3.00	gold	...	6	52.40	8620	1.10	1.25	100	1089.00	1117	1	1995-09-03
39	42	1935-08-17	68	M	63	42	34	OWNER	4.00	classic	...	6	57.20	9893	4.09	4.72	96	5623.00	5887	1	1998-11-26
52	56	1960-03-31	21	M	38	56	48	OWNER	7.00	classic	...	7	67.00	9104	1.51	2.07	123	2299.00	2354	1	1998-06-11

	client_id	total_spent	num_transactions	avg_balance	max_balance	min_balance	std_balance	avg_trans_amount	med_trans_amount	max_trans_amount	...	trans_k_symbol	num_loans	loan_amount	loan_duration	loan_payments	loan_status	num_perm_orders	total_order_amount	avg_order_amount	order_k_symbol
0	9	3058.40	76	67663.67	94463.20	45378.90	11715.44	40.24	-14.60	33975.00	...	unknown	0	0.00	0.00	0.00	missing	1	4880.00	4880.00	household_payment
1	19	3018.80	62	38999.41	54029.20	30565.90	6256.09	48.69	-14.60	22137.00	...	unknown	0	0.00	0.00	0.00	missing	1	3629.00	3629.00	household_payment
2	41	5542.80	71	57595.67	90454.90	43568.60	10541.14	78.07	-14.60	35384.00	...	unknown	0	0.00	0.00	0.00	missing	1	3892.00	3892.00	household_payment
3	42	19933.20	264	53658.05	90450.50	33049.90	14855.75	75.50	-502.00	47768.00	...	unknown	0	0.00	0.00	0.00	missing	3	8051.00	2683.67	household_payment
4	56	-28867.20	182	46835.34	72802.50	23038.10	10842.07	-158.61	-1029.00	34221.00	...	unknown	0	0.00	0.00	0.00	missing	2	7297.00	3648.50	household_payment
5	76	103591.80	201	55854.26	82266.30	600.00	16056.67	515.38	-686.00	36885.00	...	unknown	0	0.00	0.00	0.00	missing	3	7402.00	2467.33	household_payment
6	77	6963.80	70	63920.13	127894.60	7811.40	25483.83	99.48	-100.00	67070.00	...	unknown	0	0.00	0.00	0.00	missing	1	4471.30	4471.30	leasing_payment
7	79	35148.80	500	75014.51	108994.30	29408.30	17998.80	70.30	-137.00	53336.00	...	unknown	0	0.00	0.00	0.00	missing	4	3089.00	772.25	household_payment
8	87	55443.40	31	45905.98	61504.40	400.00	14383.75	1788.50	164.30	28666.50	...	unknown	0	0.00	0.00	0.00	missing	1	6061.00	6061.00	household_payment
9	112	59146.80	48	48462.82	80319.00	200.00	15014.22	1232.22	-14.60	35890.00	...	unknown	0	0.00	0.00	0.00	missing	0	0.00	0.00	missing
10	114	278543.50	595	69144.19	124923.20	22442.90	22117.07	468.14	-644.00	66332.00	...	unknown	0	0.00	0.00	0.00	missing	5	8160.10	1632.02	household_payment
11	116	120650.50	610	41531.52	57172.40	24553.00	6955.75	197.79	-15.00	26892.00	...	unknown	1	102876.00	12.00	8573.00	finished_ok	5	12438.00	2487.60	household_payment
12	127	35889.50	28	39236.71	74693.60	800.00	17997.52	1281.77	184.25	49101.00	...	unknown	0	0.00	0.00	0.00	missing	1	7348.00	7348.00	loan_payment
13	128	36058.10	38	41884.39	67012.00	500.00	12480.63	948.90	-14.60	26648.00	...	unknown	0	0.00	0.00	0.00	missing	1	3735.00	3735.00	household_payment
14	130	4736.70	71	51259.83	72097.60	35155.20	7271.79	66.71	-14.60	23554.50	...	unknown	0	0.00	0.00	0.00	missing	1	4803.00	4803.00	household_payment
15	132	35554.50	312	45917.31	70035.10	26316.90	10557.06	113.96	-349.00	35157.00	...	unknown	1	162576.00	36.00	4516.00	running_ok	3	7664.00	2554.67	household_payment
16	138	11847.90	71	36760.95	50977.90	23623.10	6183.48	166.87	-14.60	11498.00	...	unknown	0	0.00	0.00	0.00	missing	1	5110.00	5110.00	household_payment
17	146	2484.60	59	56156.39	83683.80	40858.80	11675.50	42.11	-14.60	33903.00	...	unknown	0	0.00	0.00	0.00	missing	0	0.00	0.00	missing
18	158	-22595.70	95	54477.29	144111.50	12811.70	29016.97	-237.85	-100.00	46248.00	...	unknown	1	88440.00	12.00	7370.00	finished_ok	1	7370.20	7370.20	loan_payment
19	161	37297.70	33	35464.56	47696.70	300.00	10791.01	1130.23	143.20	14446.00	...	unknown	0	0.00	0.00	0.00	missing	0	0.00	0.00	missing
20	192	2150.90	73	43474.61	77770.30	21792.00	11794.96	29.46	-14.60	33081.00	...	unknown	0	0.00	0.00	0.00	missing	1	8694.00	8694.00	household_payment
21	208	19489.20	78	45644.89	77195.10	24818.10	11246.25	249.86	-367.30	33584.00	...	unknown	0	0.00	0.00	0.00	missing	1	4221.00	4221.00	household_payment
22	219	-4838.20	76	63439.46	96782.90	16578.90	17043.92	-63.66	-30.00	34892.00	...	unknown	0	0.00	0.00	0.00	missing	0	0.00	0.00	missing
23	225	60004.60	47	47443.27	66806.50	1000.00	12765.70	1276.69	-1000.00	28846.00	...	unknown	0	0.00	0.00	0.00	missing	1	6526.00	6526.00	household_payment
24	231	48235.00	69	55647.91	104997.50	25479.10	20054.32	699.06	-14.60	66314.00	...	unknown	0	0.00	0.00	0.00	missing	1	4420.90	4420.90	leasing_payment
25	236	2011.20	73	39706.70	56095.60	23063.20	6444.60	27.55	-14.60	21924.00	...	unknown	0	0.00	0.00	0.00	missing	1	6260.00	6260.00	household_payment
26	238	84857.20	436	43542.54	55792.00	26077.20	6608.01	194.63	-385.00	19408.00	...	unknown	0	0.00	0.00	0.00	missing	4	8626.00	2156.50	household_payment
27	242	15730.20	59	49101.27	63940.20	40488.40	5798.41	266.61	-14.60	12921.00	...	unknown	0	0.00	0.00	0.00	missing	0	0.00	0.00	missing
28	254	87615.70	57	75997.47	151101.30	800.00	30330.23	1537.12	219.90	48433.00	...	unknown	0	0.00	0.00	0.00	missing	1	2394.00	2394.00	leasing_payment
29	255	-9043.50	90	61332.41	115490.70	10335.50	22959.02	-100.48	-4203.00	74385.00	...	unknown	0	0.00	0.00	0.00	missing	1	13841.00	13841.00	household_payment
30	256	6158.50	79	57416.37	85656.00	39899.00	11874.86	77.96	-2000.00	36408.00	...	unknown	0	0.00	0.00	0.00	missing	1	5625.00	5625.00	household_payment
31	267	-14792.40	74	49031.25	89619.40	28244.90	15495.51	-199.90	-14.60	51614.00	...	unknown	0	0.00	0.00	0.00	missing	1	3440.90	3440.90	leasing_payment
32	272	97877.40	90	44790.40	72338.60	1000.00	12181.03	1087.53	21.20	28588.00	...	unknown	0	0.00	0.00	0.00	missing	2	18040.00	9020.00	household_payment
33	273	4542.90	76	46721.23	100098.40	10490.60	19893.41	59.78	-14.60	67038.00	...	unknown	0	0.00	0.00	0.00	missing	1	9381.00	9381.00	household_payment
34	280	39156.00	35	46305.45	78053.90	700.00	14540.56	1118.74	11.60	33514.00	...	unknown	0	0.00	0.00	0.00	missing	1	3643.00	3643.00	household_payment
35	295	-5785.20	318	51906.47	58955.70	46078.60	3531.58	-18.19	-226.00	3972.00	...	unknown	0	0.00	0.00	0.00	missing	3	2648.00	882.67	household_payment
36	305	12573.40	67	56597.16	81559.80	34000.90	10676.04	187.66	-14.60	33464.00	...	unknown	0	0.00	0.00	0.00	missing	0	0.00	0.00	missing
37	309	-14611.70	64	57366.20	76527.20	33101.60	12078.19	-228.31	-14.60	28440.00	...	unknown	0	0.00	0.00	0.00	missing	0	0.00	0.00	missing
38	321	-3596.00	79	66694.10	136506.30	27933.20	23404.84	-45.52	-1800.00	74628.00	...	unknown	0	0.00	0.00	0.00	missing	1	4975.20	4975.20	leasing_payment
39	326	-15824.50	75	71514.07	88571.00	39599.60	12414.23	-210.99	-14.60	35636.00	...	unknown	0	0.00	0.00	0.00	missing	1	5462.00	5462.00	household_payment

	client_id	total_spent	num_transactions	avg_balance	max_balance	min_balance	std_balance	avg_trans_amount	med_trans_amount	max_trans_amount	...	loan_status	num_perm_orders	total_order_amount	avg_order_amount	order_k_symbol	month
0	9	3058.40	76	67663.67	94463.20	45378.90	11715.44	40.24	-14.60	33975.00	...	missing	1	4880.00	4880.00	household_payment	1998-10-01
1	19	3018.80	62	38999.41	54029.20	30565.90	6256.09	48.69	-14.60	22137.00	...	missing	1	3629.00	3629.00	household_payment	1998-03-01
2	41	5542.80	71	57595.67	90454.90	43568.60	10541.14	78.07	-14.60	35384.00	...	missing	1	3892.00	3892.00	household_payment	1995-09-01
3	42	19933.20	264	53658.05	90450.50	33049.90	14855.75	75.50	-502.00	47768.00	...	missing	3	8051.00	2683.67	household_payment	1998-11-01
4	56	-28867.20	182	46835.34	72802.50	23038.10	10842.07	-158.61	-1029.00	34221.00	...	missing	2	7297.00	3648.50	household_payment	1998-06-01
5	76	103591.80	201	55854.26	82266.30	600.00	16056.67	515.38	-686.00	36885.00	...	missing	3	7402.00	2467.33	household_payment	1997-10-01
6	77	6963.80	70	63920.13	127894.60	7811.40	25483.83	99.48	-100.00	67070.00	...	missing	1	4471.30	4471.30	leasing_payment	1996-12-01
7	79	35148.80	500	75014.51	108994.30	29408.30	17998.80	70.30	-137.00	53336.00	...	missing	4	3089.00	772.25	household_payment	1997-10-01
8	87	55443.40	31	45905.98	61504.40	400.00	14383.75	1788.50	164.30	28666.50	...	missing	1	6061.00	6061.00	household_payment	1994-06-01
9	112	59146.80	48	48462.82	80319.00	200.00	15014.22	1232.22	-14.60	35890.00	...	missing	0	0.00	0.00	missing	1996-02-01

	client_id	birth_number	district_id_client	gender	age	disp_id	account_id	disposition_type	card_id	credit_card_type	...	n_municipals_between_2000_9999	n_municipals_higher_10000	n_cities	ratio_urban_inhabitants	avg_salary	unemployment_rate_1995	unemployment_rate_1996	n_enterpreneurs_per_1k_inhabitants	n_crimes_1995	n_crimes_1996
0	1	1970-12-13	18	F	28	1	1	OWNER	NaN	NaN	...	2	1	4	65.30	8968	2.83	3.35	131	1740.00	1910
1	2	1945-02-04	1	M	53	2	2	OWNER	NaN	NaN	...	0	1	1	100.00	12541	0.29	0.43	167	85677.00	99107
2	3	1940-10-09	1	F	58	3	2	DISPONENT	NaN	NaN	...	0	1	1	100.00	12541	0.29	0.43	167	85677.00	99107
3	4	1956-12-01	5	M	42	4	3	OWNER	NaN	NaN	...	4	1	6	51.40	9307	3.85	4.43	118	2616.00	3040
4	5	1960-07-03	5	F	38	5	3	DISPONENT	NaN	NaN	...	4	1	6	51.40	9307	3.85	4.43	118	2616.00	3040

	account_id	district_id	frequency	date_x	order_id	bank_to	account_to	amount_x	k_symbol_x	trans_id	...	amount_y	balance	k_symbol_y	year_month	loan_id	date	amount	duration	payments	status
0	576	55	monthly_issuance	1993-01-01	30253.00	OP	71033382	3662.00	household_payment	171812	...	900.00	900.00	unknown	1993-01	NaN	NaT	NaN	NaN	NaN	NaN
1	576	55	monthly_issuance	1993-01-01	30253.00	OP	71033382	3662.00	household_payment	171813	...	6207.00	7107.00	oldage_pension	1993-01	NaN	NaT	NaN	NaN	NaN	NaN
2	576	55	monthly_issuance	1993-01-01	30253.00	OP	71033382	3662.00	household_payment	3549613	...	20.10	7127.10	interest_credited	1993-01	NaN	NaT	NaN	NaN	NaN	NaN
3	576	55	monthly_issuance	1993-01-01	30253.00	OP	71033382	3662.00	household_payment	171814	...	6207.00	13334.10	oldage_pension	1993-02	NaN	NaT	NaN	NaN	NaN	NaN
4	576	55	monthly_issuance	1993-01-01	30253.00	OP	71033382	3662.00	household_payment	3549614	...	29.60	13363.70	interest_credited	1993-02	NaN	NaT	NaN	NaN	NaN	NaN

	account_id	district_id	frequency	account_date	order_id	bank_to	account_to	order_amount	k_symbol_order	trans_id	...	trans_amount	balance	k_symbol_trans	year_month	loan_id	loan_date	loan_amount	duration	payments	status
count	1858450.00	1858450.00	1858450	1858450	1715140.00	1715140	1715140	1715140.00	1302638	1858450.00	...	1858450.00	1858450.00	1858450	1858450	449736.00	449736	449736.00	449736.00	449736.00	449736
unique	NaN	NaN	3	NaN	NaN	13	6446	NaN	4	NaN	...	NaN	NaN	8	72	NaN	NaN	NaN	NaN	NaN	4
top	NaN	NaN	monthly_issuance	NaN	NaN	YZ	89597016	NaN	household_payment	NaN	...	NaN	NaN	unknown	1998-01	NaN	NaN	NaN	NaN	NaN	running_ok
freq	NaN	NaN	1695012	NaN	NaN	139314	956	NaN	850207	NaN	...	NaN	NaN	961850	72986	NaN	NaN	NaN	NaN	NaN	214979
mean	3092.54	37.16	NaN	1994-11-17 18:33:06.509618176	33974.21	NaN	NaN	3164.05	NaN	1338948.42	...	172.69	39053.70	NaN	NaN	6194.10	1996-03-08 05:43:31.003788800	140498.13	35.47	4038.11	NaN
min	1.00	1.00	NaN	1993-01-01 00:00:00	29401.00	NaN	NaN	1.00	NaN	1.00	...	-87400.00	-41125.70	NaN	NaN	4959.00	1993-07-05 00:00:00	4980.00	12.00	304.00	NaN
25%	1249.00	13.00	NaN	1993-07-28 00:00:00	31228.00	NaN	NaN	1069.00	NaN	435635.25	...	-3019.00	23046.70	NaN	NaN	5595.00	1994-11-03 00:00:00	57120.00	24.00	2118.00	NaN
50%	2504.00	38.00	NaN	1994-08-12 00:00:00	33086.00	NaN	NaN	2454.00	NaN	866916.00	...	-62.00	33999.25	NaN	NaN	6216.00	1996-03-13 00:00:00	102876.00	36.00	3736.00	NaN
75%	3774.00	61.00	NaN	1996-03-15 00:00:00	34928.00	NaN	NaN	4545.00	NaN	2121885.00	...	168.70	50198.10	NaN	NaN	6758.00	1997-06-14 00:00:00	194280.00	48.00	5700.00	NaN
max	11382.00	77.00	NaN	1997-12-29 00:00:00	46338.00	NaN	NaN	14882.00	NaN	3682987.00	...	74812.00	209637.00	NaN	NaN	7308.00	1998-12-08 00:00:00	590820.00	60.00	9910.00	NaN
std	2622.85	25.45	NaN	NaN	3918.64	NaN	NaN	2690.10	NaN	1214786.21	...	11025.96	21866.56	NaN	NaN	677.57	NaN	111857.32	17.55	2258.83	NaN

	client_id	total_spent	num_transactions	avg_balance	max_balance	min_balance	std_balance	avg_trans_amount	med_trans_amount	max_trans_amount	...	trans_k_symbol	loan_status	num_perm_orders	total_order_amount	avg_order_amount	order_k_symbol
0	1	4952.30	69	15086.04	18790.40	10722.00	1981.35	71.77	-14.60	3679.00	...	unknown	missing	1	2452.00	2452.00	household_payment
1	2	10741.00	176	42017.28	62182.90	20515.70	9907.00	61.03	-2650.00	30354.00	...	unknown	missing	2	10638.70	5319.35	household_payment
2	4	0.00	0	0.00	0.00	0.00	0.00	0.00	0.00	0.00	...	missing	missing	3	5001.00	1667.00	household_payment
3	6	9764.00	162	19615.77	25874.00	11138.10	3111.67	60.27	-640.00	5553.00	...	unknown	missing	2	3363.00	1681.50	household_payment
4	7	0.00	0	0.00	0.00	0.00	0.00	0.00	0.00	0.00	...	missing	missing	1	2668.00	2668.00	household_payment
5	8	10742.00	67	40325.50	49769.30	32695.00	4294.62	160.33	-14.60	6669.00	...	unknown	missing	1	3954.00	3954.00	household_payment
6	10	-46708.80	190	45456.37	75251.40	19024.00	10646.48	-245.84	-2612.00	30712.00	...	unknown	missing	2	9324.00	4662.00	household_payment
7	14	13439.90	63	24377.88	32171.00	14333.60	4651.36	213.33	-14.60	3494.00	...	unknown	missing	1	2132.00	2132.00	household_payment
8	15	2452.60	162	20348.18	29053.00	11086.80	4746.87	15.14	-297.00	5938.00	...	unknown	missing	2	3592.00	1796.00	household_payment
9	17	8106.20	61	15593.24	29750.60	8631.70	4383.84	132.89	-14.60	6803.00	...	unknown	missing	1	2444.00	2444.00	household_payment

	column	dtype
0	total_spent	float64
1	num_transactions	int64
2	avg_balance	float64
3	max_balance	float64
4	min_balance	float64
5	std_balance	float64
6	avg_trans_amount	float64
7	med_trans_amount	float64
8	max_trans_amount	float64
9	min_trans_amount	float64
10	std_trans_amount	float64
11	balance_before_cc	float64
12	transaction_type	object
13	transaction_operation	object
14	trans_k_symbol	object
15	num_loans	int64
16	loan_amount	float64
17	loan_duration	float64
18	loan_payments	float64
19	loan_status	object
20	num_perm_orders	int64
21	total_order_amount	float64
22	avg_order_amount	float64
23	order_k_symbol	object
24	month	datetime64[ns]
25	gender	object
26	age	int64
27	frequency	category
28	date	datetime64[ns]
29	client_district_name	string[python]
30	client_region	string[python]
31	client_n_inhabitants	int64
32	client_n_municipals_lower_499	int64
33	client_n_municipals_between_500_1999	int64
34	client_n_municipals_between_2000_9999	int64
35	client_n_municipals_higher_10000	int64
36	client_n_cities	int64
37	client_ratio_urban_inhabitants	float64
38	client_avg_salary	int64
39	client_unemployment_rate_1995	float64
40	client_unemployment_rate_1996	float64
41	client_n_enterpreneurs_per_1k_inhabitants	int64
42	client_n_crimes_1995	float64
43	client_n_crimes_1996	int64

	accuracy	precision	recall	f1	roc_auc
Baseline (class_weight)	0.77	0.43	0.78	0.56	0.84
SMOTE	0.78	0.43	0.77	0.55	0.84
Undersampling	0.77	0.42	0.77	0.55	0.84

	feature	VIF_before	VIF_after	ΔVIF
4	med_trans_amount	2.48	2.06	-0.42
0	total_spent	2.51	1.51	-1.00
3	avg_trans_amount	3.57	2.46	-1.11
14	client_n_municipals_higher_10000	7.64	5.05	-2.59
11	client_n_municipals_lower_499	6.16	3.54	-2.62
12	client_n_municipals_between_500_1999	12.36	7.54	-4.82
10	age	12.93	7.74	-5.20
1	num_transactions	8.89	2.44	-6.45
9	avg_order_amount	10.94	2.76	-8.18
13	client_n_municipals_between_2000_9999	16.57	5.79	-10.78
2	min_balance	13.36	2.47	-10.89
5	min_trans_amount	19.79	2.47	-17.31
6	loan_amount	28.01	8.67	-19.35
8	loan_payments	28.70	5.89	-22.81
7	loan_duration	27.99	4.50	-23.49
15	client_unemployment_rate_1995	86.22	5.81	-80.41
16	client_n_crimes_1996	4154.35	2.21	-4152.14

	loan_amount	age	client_n_municipals_between_500_1999	loan_payments	client_unemployment_rate_1995	client_n_municipals_between_2000_9999	client_n_municipals_higher_10000	loan_duration	client_n_municipals_lower_499	avg_order_amount	min_trans_amount	min_balance	avg_trans_amount	num_transactions	client_n_crimes_1996	med_trans_amount	total_spent	transaction_type_withdrawal	transaction_operation_withdrawal_in_cash	trans_k_symbol_unknown	loan_status_missing	order_k_symbol_leasing_payment	order_k_symbol_loan_payment	gender_M	frequency_monthly_issuance	client_district_name_Decin	client_district_name_Litomerice	client_district_name_Mlada Boleslav	client_district_name_Sumperk	client_district_name_Uherske Hradiste	client_region_central Bohemia	client_region_north Bohemia	client_region_north Moravia	client_region_south Moravia
0	-0.16	-1.38	-0.26	-0.18	1.57	0.32	1.18	-0.19	-0.46	1.67	0.38	-1.01	-0.08	-0.62	-0.36	0.15	-0.10	0.00	1.00	1.00	1.00	0.00	0.00	1.00	1.00	1.00	0.00	0.00	0.00	0.00	0.00	1.00	0.00	0.00
1	-0.16	-1.25	0.32	-0.18	0.86	0.09	0.27	-0.19	0.91	1.63	-2.57	1.05	-0.19	0.88	-0.38	0.10	0.43	1.00	1.00	1.00	1.00	0.00	0.00	0.00	1.00	0.00	1.00	0.00	0.00	0.00	0.00	1.00	0.00	0.00
2	-0.16	-0.98	1.30	-0.18	-0.87	1.67	0.27	-0.19	-0.52	0.83	-0.31	0.02	-0.37	-0.33	-0.42	0.10	-0.41	1.00	1.00	1.00	1.00	0.00	1.00	1.00	1.00	0.00	0.00	0.00	0.00	1.00	0.00	0.00	0.00	1.00
3	-0.16	-0.04	0.72	-0.18	1.01	1.67	0.27	-0.19	-0.26	-0.13	-0.78	-1.00	1.62	-0.63	-0.43	0.10	1.51	1.00	1.00	1.00	1.00	1.00	0.00	0.00	1.00	0.00	0.00	0.00	1.00	0.00	0.00	0.00	1.00	0.00
4	-0.16	0.97	-0.13	-0.18	-0.89	0.32	-0.65	-0.19	1.61	0.39	-2.49	1.16	-0.31	1.09	-0.35	0.08	-0.16	1.00	1.00	1.00	1.00	0.00	0.00	1.00	1.00	0.00	0.00	1.00	0.00	0.00	1.00	0.00	0.00	0.00

	Baseline (class_weight)	VIF-reduziert (mean)	Δ (VIF − Baseline)
accuracy	0.77	0.76	-0.01
precision	0.43	0.41	-0.02
recall	0.78	0.75	-0.03
f1	0.56	0.53	-0.02
roc_auc	0.84	0.83	-0.01

	accuracy	precision	recall	f1	roc_auc
Random Forest	0.84	0.61	0.34	0.44	0.90
Balanced RF	0.80	0.48	0.89	0.62	0.90
XGBoost	0.84	0.57	0.58	0.57	0.89
HistGradBoost	0.84	0.55	0.67	0.60	0.90
LogReg (VIF reduced)	0.76	0.41	0.75	0.53	0.83

	threshold	precision	recall	f1
0	0.10	0.40	1.00	0.57
1	0.15	0.40	1.00	0.57
2	0.20	0.40	1.00	0.57
3	0.25	0.40	0.99	0.57
4	0.30	0.41	0.99	0.58
5	0.35	0.42	0.98	0.58
6	0.40	0.42	0.96	0.59
7	0.45	0.43	0.95	0.59
8	0.50	0.44	0.93	0.60
9	0.55	0.44	0.88	0.59
10	0.60	0.46	0.85	0.60
11	0.65	0.48	0.78	0.60
12	0.70	0.49	0.71	0.58
13	0.75	0.52	0.59	0.55
14	0.80	0.54	0.45	0.49
15	0.85	0.63	0.33	0.43
16	0.90	0.68	0.14	0.23