協作閣

An open-source collaborative blog

Applying logistic regression for classifying human-machine dialogue and human-human dialogue (Part II)

Jessica / 2019-05-16 /


Two weeks ago I used 24 continuous variables (e.g., Different Words, Sentence Length Average, Function Word proportion, etc.) to classify human-human versus human-machine dialogue. This week I instead try classifying with categorical variables. First, these categorical variables have to be converted into dummy variables.
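In this dataset the dummy coding was produced during manual annotation, but if the dialogue act labels were stored in a single categorical column, pandas could generate the same 0/1 columns automatically. A minimal sketch, using a hypothetical column name DA_type:

import pandas as pd

# hypothetical toy data: one categorical column of dialogue act labels
toy = pd.DataFrame({'DA_type': ['starter', 'Inf', 'Ans', 'starter']})

# pd.get_dummies creates one 0/1 column per distinct label
print(pd.get_dummies(toy['DA_type']))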

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import f_regression
plt.style.use('ggplot')
plt.rcParams['font.family'] = 'SimHei'  # a Chinese (heiti) font
data = pd.read_excel('sample_50_anno.xlsx', sheet_name=None)
DAdata = data.get('DA_tp')  # get a specific sheet as a DataFrame

The categorical variables this time are 11 dialogue act types, ten of which enter the model as predictors below. The data are still last week's 100 dialogues (50 human-human, 50 human-machine). I annotated every dialogue by hand: each utterance gets a 1 under the dialogue act type it belongs to and a 0 otherwise.

The dummy-coded data look like this:

df=DAdata[['starter','Inf','Ans','offer','Sug','Req','Chk.Q','Pro.Q','Set.Q','Autopositive','Machine']]
df.head()
starter Inf Ans offer Sug Req Chk.Q Pro.Q Set.Q Autopositive Machine
0 1 0 0 0 0 0 0 0 0 0 0
1 0 0 1 0 0 0 0 0 0 0 0
2 0 0 0 0 0 1 0 0 0 0 0
3 0 0 1 0 0 0 0 0 0 0 0
4 1 0 0 0 0 0 0 0 0 0 0
x=df[['starter','Inf','Ans','offer','Sug','Req','Chk.Q','Pro.Q','Set.Q','Autopositive']]
y=df[['Machine']]

x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.3,random_state=2021) 
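A side note: with a sample this small, a random split can leave the two classes unevenly represented in the training and test sets. train_test_split accepts a stratify argument that preserves the class ratio in both splits; a possible variant (not the split actually used below):

x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=2021, stratify=y)  # stratify keeps the human/machine ratio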

x_train
starter Inf Ans offer Sug Req Chk.Q Pro.Q Set.Q Autopositive
173 0 1 0 0 0 0 0 0 1 0
112 0 0 0 0 0 0 0 0 1 0
179 0 1 0 0 0 0 0 0 0 0
134 1 0 0 0 0 0 0 0 0 0
180 0 0 0 0 0 0 0 1 0 0
66 0 0 0 0 1 0 0 0 0 0
34 1 0 0 0 0 1 0 0 0 0
169 0 1 0 0 0 0 0 0 0 0
77 1 0 0 0 0 0 0 0 0 0
45 0 0 0 0 0 0 0 0 1 0
6 1 0 0 0 0 0 0 0 0 0
163 1 0 0 0 0 0 0 0 0 0
30 1 0 0 0 0 0 0 0 0 0
73 1 0 0 0 0 0 0 0 0 0
65 1 0 0 0 0 1 0 0 0 0
68 0 1 0 0 0 0 0 0 0 0
170 1 0 0 0 0 0 0 0 0 0
75 0 0 0 0 0 0 0 1 0 0
12 1 0 0 0 0 0 0 0 0 0
130 0 0 0 0 0 0 0 1 0 0
104 1 0 0 0 0 0 0 0 0 0
81 0 0 0 0 0 1 0 0 0 0
86 0 1 0 0 0 0 0 0 0 0
69 0 0 0 0 0 0 0 1 0 0
139 0 0 1 0 0 0 0 0 1 0
177 0 0 1 0 0 0 0 0 0 0
202 0 0 0 0 0 0 0 0 1 0
88 0 0 0 0 0 0 0 0 1 0
145 0 0 0 0 0 0 0 0 1 0
98 0 0 1 0 0 0 0 0 0 0
... ... ... ... ... ... ... ... ... ... ...
195 0 1 0 0 0 0 0 0 0 0
123 0 0 0 0 0 0 0 0 1 0
54 0 1 0 0 0 0 0 0 0 0
178 1 0 0 0 0 0 0 0 0 0
144 1 0 0 0 0 0 0 0 0 0
49 1 0 0 0 0 1 0 0 0 0
63 0 0 0 0 0 0 0 0 1 0
176 0 0 0 0 0 0 0 0 1 0
154 0 0 1 0 0 0 0 0 0 0
124 0 0 0 0 0 0 0 0 1 0
110 0 0 0 0 0 0 0 0 1 0
1 0 0 1 0 0 0 0 0 0 0
7 0 0 0 0 0 0 0 0 1 0
101 0 1 0 0 0 0 0 0 0 0
33 0 1 0 0 0 0 0 0 0 0
70 0 0 1 0 0 0 0 0 0 0
102 1 0 0 0 0 0 0 0 0 0
140 0 0 0 0 0 0 0 1 0 0
152 0 0 1 0 0 0 0 0 0 0
93 0 0 0 0 0 0 0 1 0 0
21 0 0 0 0 0 0 0 0 0 1
157 1 0 0 0 0 0 0 0 0 0
62 0 0 1 0 0 0 0 0 0 0
44 0 1 0 0 0 0 0 0 0 0
94 0 1 0 0 0 0 0 0 0 0
109 0 0 1 0 0 0 0 0 0 0
128 0 0 0 0 0 0 0 1 0 0
57 0 0 0 0 0 0 0 0 1 0
85 0 0 0 0 0 0 1 0 0 0
116 0 0 0 0 0 0 0 1 0 0

144 rows × 10 columns

from sklearn.preprocessing import StandardScaler
sc = StandardScaler()

# fit the scaler on the training set only, then apply the same transform to both sets
sc.fit(x_train)

x_train_nor = sc.transform(x_train)
x_test_nor = sc.transform(x_test)
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(solver='liblinear')  # the scikit-learn 0.20 default solver, made explicit
lr.fit(x_train_nor, y_train.values.ravel())  # ravel() turns the one-column DataFrame into the expected 1-D array

# print the coefficients
print(lr.coef_)
# print the significance of the 10 predictors, judging by whether each p-value is below 0.05 (95% confidence level)
print(f_regression(x_train_nor, y_train.values.ravel())[1])
# print the intercept
print(lr.intercept_)
[[-0.08006139 -0.44116689  0.08343295  0.         -0.45971731 -0.83366503
  -0.22135901 -0.09189166  0.35124534 -0.08632877]]
[0.50361434 0.01848812 0.24974631        nan 0.13953905 0.00446218
 0.27466972 0.8810979  0.00176625 0.95305294]
[-0.00500389]
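The nan for offer most likely means that column is constant (all zeros) in this particular training split, so f_regression cannot compute a correlation for it; its coefficient is also exactly 0 for the same reason. To make the p-values easier to read, they can be paired with the column names; a minimal sketch:

p_values = f_regression(x_train_nor, y_train.values.ravel())[1]
for name, p in zip(x.columns, p_values):
    # mark predictors significant at the 95% confidence level
    print(name, round(p, 4), '*' if p < 0.05 else '')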


np.round(lr.predict_proba(x_test_nor),3)
array([[0.461, 0.539],
       [0.498, 0.502],
       [0.684, 0.316],
       [0.236, 0.764],
       [0.367, 0.633],
       [0.684, 0.316],
       [0.461, 0.539],
       [0.367, 0.633],
       [0.367, 0.633],
       [0.684, 0.316],
       [0.236, 0.764],
       [0.732, 0.268],
       [0.367, 0.633],
       [0.684, 0.316],
       [0.236, 0.764],
       [0.684, 0.316],
       [0.461, 0.539],
       [0.684, 0.316],
       [0.461, 0.539],
       [0.367, 0.633],
       [0.684, 0.316],
       [0.367, 0.633],
       [0.461, 0.539],
       [0.236, 0.764],
       [0.236, 0.764],
       [0.367, 0.633],
       [0.498, 0.502],
       [0.972, 0.028],
       [0.991, 0.009],
       [0.981, 0.019],
       [0.973, 0.027],
       [0.598, 0.402],
       [0.461, 0.539],
       [0.236, 0.764],
       [0.367, 0.633],
       [0.684, 0.316],
       [0.236, 0.764],
       [0.367, 0.633],
       [0.367, 0.633],
       [0.236, 0.764],
       [0.461, 0.539],
       [0.684, 0.316],
       [0.461, 0.539],
       [0.461, 0.539],
       [0.684, 0.316],
       [0.367, 0.633],
       [0.236, 0.764],
       [0.972, 0.028],
       [0.461, 0.539],
       [0.684, 0.316],
       [0.498, 0.502],
       [0.416, 0.584],
       [0.973, 0.027],
       [0.498, 0.502],
       [0.461, 0.539],
       [0.236, 0.764],
       [0.684, 0.316],
       [0.639, 0.361],
       [0.723, 0.277],
       [0.684, 0.316],
       [0.236, 0.764],
       [0.236, 0.764]])
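Each row of predict_proba is [P(class 0), P(class 1)], i.e. the estimated probabilities of human-human and human-machine respectively, and lr.predict simply picks class 1 whenever the second column exceeds 0.5. A quick sanity check:

proba = lr.predict_proba(x_test_nor)
manual_pred = (proba[:, 1] > 0.5).astype(int)
# should print True: thresholding the machine probability at 0.5 reproduces lr.predict
print(np.array_equal(manual_pred, lr.predict(x_test_nor)))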
import itertools

def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    """
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
from sklearn.metrics import confusion_matrix
cnf = confusion_matrix(y_test, lr.predict(x_test_nor))
print('Confusion matrix:', cnf)
Confusion matrix: [[12 20]
 [11 19]]
target_name = ['human', 'machine']  # class 0 = human-human, class 1 = human-machine
plot_confusion_matrix(cnf, classes=target_name, title='confusion matrix')
plt.show()
Confusion matrix, without normalization
[[12 20]
 [11 19]]



(Figure: confusion matrix heatmap)
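Reading the matrix: rows are the true labels and columns the predictions, so 12 of the 32 true human-human utterances and 19 of the 30 true human-machine utterances were classified correctly. Those counts give the metrics below.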

accuracy = (12+19)/(12+19+11+20)
print(accuracy)
0.5
# precision: of the 39 utterances predicted as machine, 19 really are
precision = 19/39
print(precision)
0.48717948717948717
# recall (also called coverage or sensitivity): of the 30 true machine utterances, 19 were caught
recall = 19/30
print(recall)
0.6333333333333333
# F1 is the harmonic mean of precision and recall
F1 = 2*precision*recall/(precision+recall)
print(F1)
0.5507246376811594
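These hand computations can be cross-checked against sklearn's built-in metric functions; a minimal sketch:

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
y_pred = lr.predict(x_test_nor)
print(accuracy_score(y_test, y_pred))   # 0.5
print(precision_score(y_test, y_pred))  # precision for class 1 (machine)
print(recall_score(y_test, y_pred))     # recall for class 1
print(f1_score(y_test, y_pred))         # harmonic mean of the two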

It looks like the results are not as good as last time…
