信用評分卡模型在Python中實踐(上)
http://dwz.date/b9vv

一、前言
之前看到信用標準評分卡模型開發及實現的文章,是標準的評分卡建模流程在R上的實現,非常不錯,就想著能不能把開發流程在Python上實驗一遍呢,經過一番折騰后,終于在Python上用類似的代碼和包實現出來,由于Python和R上函數的差異以及樣本抽樣的差異,本文的結果與該文有一定的差異,這是意料之中的,也是正常,接下來就介紹建模的流程和代碼實現。
##### packages used throughout the scorecard build #####
# Fixes vs. original: removed the duplicated `import matplotlib.pyplot as plt`,
# and replaced the removed/deprecated `scipy.stats.stats` module path with
# `scipy.stats` (spearmanr lives there in all supported SciPy versions).
import math

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.stats as stats
from sklearn.feature_selection import RFE, f_regression
from sklearn.linear_model import LogisticRegression
from sklearn.utils import shuffle
二、數(shù)據(jù)集準備
數據來自互聯網上經常被用來研究信用風險評級模型的加州大學機器學習數據庫中的german credit data,原本是存在R包"klaR"中的GermanCredit,我在R中把它加載進去,然后導出csv,最終導入Python作為數據集
############## R #################
library(klaR)
data(GermanCredit ,package
=
"klaR"
)
write.csv(GermanCredit,
"/filePath/GermanCredit.csv"
)
該數據集包含了1000個樣本,每個樣本包括了21個變量(屬性),其中包括1個違約狀態變量"credit_risk",剩余20個變量包括了所有的7個定量和13個定性指標
# Load the exported German credit dataset (1000 rows, 21 columns)
df_raw = pd.read_csv('/filePath/GermanCredit.csv')
df_raw.dtypes  # prints the column dtypes shown below
Unnamed:?
0
??????????????????int64
status?????????????????????
object
duration??????????????????? int64
credit_history?????????????
object
purpose????????????????????
object
amount????????????????????? int64
savings????????????????????
object
employment_duration????????
object
installment_rate??????????? int64
personal_status_sex????????
object
other_debtors??????????????
object
present_residence?????????? int64
property
???????????????????object
age???????????????????????? int64
other_installment_plans????
object
housing????????????????????
object
number_credits????????????? int64
job????????????????????????
object
people_liable?????????????? int64
telephone??????????????????
object
foreign_worker?????????????
object
credit_risk????????????????
object
接下來對數據集進行拆分,按照7:3拆分訓練集和測試集,并將違約樣本用"1"表示,正常樣本用"0"表示。
# extract train / test samples
def split_data(data, ratio=0.7, seed=None):
    """Shuffle a DataFrame and split it into train and test partitions.

    Parameters
    ----------
    data : pandas.DataFrame to split.
    ratio : float, fraction of rows assigned to the training set.
    seed : int or None; random_state for the shuffle. Pass a fixed value
        for a reproducible split, None for a random one.

    Returns
    -------
    (train, test) : two DataFrames covering the first ratio*len rows and
        the remainder of the shuffled data, respectively.
    """
    # Bug fix: the original `if seed:` treated seed=0 as "no seed given";
    # an explicit None test makes 0 a valid reproducible seed.
    if seed is not None:
        shuffle_data = shuffle(data, random_state=seed)
    else:
        shuffle_data = shuffle(data, random_state=np.random.randint(10000))
    cut = int(ratio * len(shuffle_data))
    train = shuffle_data.iloc[:cut, ]
    test = shuffle_data.iloc[cut:, ]
    return train, test
?
# A fixed seed guarantees the identical split on every run
df_train, df_test = split_data(df_raw, ratio=0.7, seed=666)
# Encode the target: default -> 1, good loan -> 0
# (equivalent to: np.where(df_train['credit_risk'] == 'good', 0, 1))
credit_risk = [int(x != 'good') for x in df_train['credit_risk']]
data = df_train
data['credit_risk'] = credit_risk
三、定量和定性指標篩選
Python里面可以根據dtype對指標進行定量或者定性的區分,int64為定量指標,object則為定性指標,定量指標的篩選本文通過Python sklearn包中的f_regression進行單變量指標篩選,根據F檢驗值和P值來選擇入模變量
# collect the quantitative indicators: int64 columns are quantitative,
# object columns are qualitative
quant_index = np.where(data.dtypes == 'int64')
quant_vars = np.array(data.columns)[quant_index]
# drop the first entry (the unnamed CSV row-index column)
quant_vars = np.delete(quant_vars, 0)

df_feature = pd.DataFrame(data,
                          columns=['duration', 'amount', 'installment_rate',
                                   'present_residence', 'age',
                                   'number_credits', 'people_liable'])
# univariate F-test of each quantitative feature against the target
f_regression(df_feature, credit_risk)
# variables retained after stepwise screening: keep those with p-value <= 0.1
quant_model_vars = ["duration", "amount", "age", "installment_rate"]

定性指標的篩選可通過計算IV值,并選擇IV值大于某一數值的條件來篩選指標,此處自己實現了WOE和IV的函數計算,本文選擇了IV值大于0.1的指標,算是比較高的IV值了,一般大于0.02就算是好變量
def woe(bad, good):
    """Weight of Evidence per bin: log(bad-share / good-share).

    bad, good : aligned pandas Series (or numpy arrays) of bad and good
    counts per category/bucket. Returns the elementwise WoE values.
    """
    bad_share = bad / bad.sum()
    good_share = good / good.sum()
    return np.log(bad_share / good_share)
?
# Information Value of every qualitative (factor) variable
all_iv = np.empty(len(factor_vars))
woe_dict = dict()  # keep each variable's WoE table for later re-use
for i, var in enumerate(factor_vars):
    grouped = data.groupby(var)['credit_risk'].agg([np.sum, len])
    bad = grouped['sum']          # target is 1 for default, so sum == bad count
    good = grouped['len'] - bad
    var_woe = woe(bad, good)
    woe_dict[var] = var_woe
    # IV = sum over levels of (bad share - good share) * WoE
    all_iv[i] = ((bad / bad.sum() - good / good.sum()) * var_woe).sum()
# keep variables with IV > 0.1 (a fairly strong threshold; > 0.02 is already
# considered predictive)
high_index = np.where(all_iv > 0.1)
qual_model_vars = factor_vars[high_index]

四、連續變量分段和離散變量降維
    接下來對連續變量進行分段,由于R包有smbinning自動分箱函數,Python沒有,只好采用別的方法進行自動分箱,從網上找了一個monotonic binning的Python實現,本文進行了改造,增加分段時的排序和woe的計算,還支持手動分箱計算woe,具體代碼如下:
def binning(Y, X, n=None):
    """Monotonic binning of a continuous variable X against binary target Y.

    When n is None, start from 10 quantile buckets and shrink the bucket
    count until the per-bucket means of X and Y are perfectly monotonic
    (|Spearman r| == 1).  When n is given, bin into exactly n quantile
    buckets (manual binning).

    Returns a DataFrame with one row per bucket: min/max of X, bad count,
    total count, bad rate and WoE.
    """
    # fill missing values with the median
    X2 = X.fillna(np.median(X))
    # Bug fix: comparison was `n == None`; identity (`is None`) is correct.
    if n is None:
        r = 0
        n = 10
        while np.abs(r) < 1:
            # rank(method='first') breaks ties so qcut gets unique bin edges
            # (plain pd.qcut(X2, n) fails on heavily tied data)
            d1 = pd.DataFrame(
                {"X": X2, "Y": Y,
                 "Bucket": pd.qcut(X2.rank(method='first'), n)})
            d2 = d1.groupby('Bucket', as_index=True)
            r, p = stats.spearmanr(d2.mean().X, d2.mean().Y)
            n = n - 1
    else:
        d1 = pd.DataFrame(
            {"X": X2, "Y": Y,
             "Bucket": pd.qcut(X2.rank(method='first'), n)})
        d2 = d1.groupby('Bucket', as_index=True)
    d3 = pd.DataFrame()
    d3['min'] = d2.min().X
    d3['max'] = d2.max().X
    d3['bad'] = d2.sum().Y          # Y is 1 for default, so sum == bad count
    d3['total'] = d2.count().Y
    d3['bad_rate'] = d2.mean().Y
    d3['woe'] = woe(d3['bad'], d3['total'] - d3['bad'])
    return d3

# duration: inspect automatic bins, then hard-code the resulting cutpoints/WoE
binning(data['credit_risk'], data['duration'])
duration_Cutpoint = list()
duration_WoE = list()
for x in data['duration']:
    if x <= 12:
        duration_Cutpoint.append('<= 12')
        duration_WoE.append(-0.488031)
    elif x <= 24:
        duration_Cutpoint.append('<= 24')
        duration_WoE.append(-0.109072)
    else:  # x > 24
        duration_Cutpoint.append('> 24')
        duration_WoE.append(0.502560)

# amount: manual binning into 2 buckets
binning(data['credit_risk'], data['amount'], 2)
amount_Cutpoint = list()
amount_WoE = list()
for x in data['amount']:
    if x <= 2315:
        amount_Cutpoint.append('<= 2315')
        amount_WoE.append(-0.089829)
    else:  # x > 2315
        amount_Cutpoint.append('> 2315')
        amount_WoE.append(0.086733)

# age: inspect automatic bins, then hard-code the resulting cutpoints/WoE
binning(data['credit_risk'], data['age'])
age_Cutpoint = list()
age_WoE = list()
for x in data['age']:
    if x <= 28:
        age_Cutpoint.append('<= 28')
        age_WoE.append(0.279209)
    elif x <= 38:
        age_Cutpoint.append('<= 38')
        age_WoE.append(-0.066791)
    else:  # x > 38
        age_Cutpoint.append('> 38')
        age_WoE.append(-0.241013)

# installment_rate: inspect automatic bins, then hard-code cutpoints/WoE.
# Bug fix: the cutpoint list was created as `nstallment_rate_Cutpoint`
# (dropped leading 'i') while the loop appends to `installment_rate_Cutpoint`,
# which would raise NameError on the first iteration.
binning(data['credit_risk'], data['installment_rate'])
installment_rate_Cutpoint = list()
installment_rate_WoE = list()
for x in data['installment_rate']:
    if x <= 2:
        installment_rate_Cutpoint.append('<= 2')
        installment_rate_WoE.append(-0.136411)
    if x > 2 and x < 4:
        installment_rate_Cutpoint.append('< 4')
        installment_rate_WoE.append(-0.130511)
    if x >= 4:
        installment_rate_Cutpoint.append('>= 4')
        installment_rate_WoE.append(0.248710)
離散變量由于不同變量的維度不一致,為了防止"維數災難",對多維的變量進行降維。在評級模型開發中的降維處理方法,通常是將屬性相似的合并處理,以達到降維的目的,本文采用參考文獻的做法進行合并降維。
# dimensionality reduction and WoE for the qualitative indicators
discrete_data = data[qual_model_vars]
discrete_data['credit_risk'] = data['credit_risk']
# inspect the level counts of `purpose` before merging
pd.value_counts(data['purpose'])
# Merge similar purpose levels to reduce the number of categories:
#   car (new) + car (used); radio/television + furniture/equipment;
#   others + repairs + business; retraining + education.
purpose_merge = {
    'car (new)': 'car(new/used)',
    'car (used)': 'car(new/used)',
    'radio/television': 'radio/television/furniture/equipment',
    'furniture/equipment': 'radio/television/furniture/equipment',
    'others': 'others/repairs/business',
    'repairs': 'others/repairs/business',
    'business': 'others/repairs/business',
    'retraining': 'retraining/education',
    'education': 'retraining/education',
}
discrete_data['purpose'] = discrete_data['purpose'].replace(purpose_merge)
# recompute the WoE table for the merged purpose variable
data_group = discrete_data.groupby('purpose')['credit_risk'].agg([np.sum, len])
bad = data_group['sum']
good = data_group['len'] - bad
woe_dict['purpose'] = woe(bad, good)
所有離散變量的分段和woe值如下:
?

## store the WoE values of every discrete variable.
# Refactor: the original repeated the same nested lookup loop six times
# (once per variable); the shared logic now lives in one helper.
def _lookup_woe(values, woe_table):
    """Translate raw category values into their WoE scores.

    values : iterable of raw category labels.
    woe_table : pandas Series of WoE values indexed by category level.
    Returns a list of WoE floats; values whose level is missing from the
    table are silently skipped (same behaviour as the original loops).
    """
    scores = list()
    for value in values:
        for level in woe_table.index:
            if value == level:
                scores.append(woe_table[level])
    return scores

# purpose
purpose_WoE = _lookup_woe(discrete_data['purpose'], woe_dict['purpose'])
# status
status_WoE = _lookup_woe(discrete_data['status'], woe_dict['status'])
# credit_history
credit_history_WoE = _lookup_woe(discrete_data['credit_history'],
                                 woe_dict['credit_history'])
# savings
savings_WoE = _lookup_woe(discrete_data['savings'], woe_dict['savings'])
# employment_duration
employment_duration_WoE = _lookup_woe(discrete_data['employment_duration'],
                                      woe_dict['employment_duration'])
# property
property_WoE = _lookup_woe(discrete_data['property'], woe_dict['property'])
轉載https://blog.csdn.net/kxiaozhuk/article/details/84612632
至此,就完成了數據集的準備、指標的篩選和分段降維,下面就要進行邏輯回歸模型的建模了
python信用評分卡(附代碼,博主錄制)
http://dwz.date/b62p
