Pythonで線形回帰（重回帰） - ぺーぺーSEのブログ

線形回帰をPythonで解いてみる。
データセットは下記のBostonデータセットを使用する。

blog.pepese.com

単回帰は下記。

blog.pepese.com

3Dの描画については下記。

blog.pepese.com

ライブラリ読込、データセットのロードまでは以下。

import numpy as np
from sklearn import datasets
from matplotlib import pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from scipy import linalg as LA
from sklearn import linear_model
# %pylab inline --no-import-all

boston = datasets.load_boston()
rooms = boston.data[:, 5]
criminals = boston.data[:, 0]
house_prices = boston.target

『%pylab inline --no-import-all』を入れておく事で、Jupyter Notebook上でグラフが表示できるようになるが、別窓にならないと3D回転できないのでコメントアウト。データの様子が見たい場合は以下。

fig = plt.figure() # figureオブジェクトを取得
ax = fig.gca(projection='3d') # 軸（ax）オブジェクトを3d指定で取得

# データセットの描画
ax.scatter3D(rooms, criminals, house_prices)
ax.set_xlabel("x_1:rooms")
ax.set_ylabel("x_2:criminals")
ax.set_zlabel("y:house_prices")

plt.show()

部屋の数と犯罪の数は、物件の価格と相関があるはず、という重回帰なモデルを考える。
ここでは、3D描画できるように説明変数は2つで行う。

ライブラリで解く

NumPyで解く

「numpy.linalg.lstsq」を使用して最小二乗法で解く。

X = np.array([rooms, criminals]) # 説明変数
y = house_prices # 目的変数
n = X.shape[1] # 次数の取得、[0]：行、[1]：列
X = np.vstack([np.ones(n), X]) # 説明変数に定数項x_0を付ける
coef = np.linalg.lstsq(X.T, y) # 演算部分
coef
# (array([-29.30168135,   8.3975317 ,  -0.2618229 ]),
#  array([ 19627.18158372]),
#  3,
#  array([ 219.52604791,  128.06307554,    2.38091887]))

numpy.linalg.lstsqの結果coefには下記が入っている。

coef[0]：最小二乗解p
coef[1]：残差の合計residues
coef[2]：係数行列thetaのランクrank
coef[3]：係数行列thetaのsingular value

下記で描画。

fig = plt.figure() # figureオブジェクトを取得
ax = fig.gca(projection='3d') # 軸（ax）オブジェクトを3d指定で取得

# データセットの描画
ax.scatter3D(rooms, criminals, house_prices)
ax.set_xlabel("x_1:rooms")
ax.set_ylabel("x_2:criminals")
ax.set_zlabel("y:house_prices")

# NumPyで解く
X = np.array([rooms, criminals]) # 説明変数
y = house_prices # 目的変数
n = X.shape[1] # 次数の取得
X = np.vstack([np.ones(n), X]) # 説明変数に定数項を付ける
coef = np.linalg.lstsq(X.T, y) # 演算部分
# 結果の取得
theta_0, theta_1, theta_2 = coef[0]

# プロットする各軸の点を作成
x_0 = 1
x_1 = np.arange(3, 10, 1)
x_2 = np.arange(-20, 100, 1)
ax_1, ax_2 = np.meshgrid(x_1, x_2)

# 重回帰分析の結果
y = theta_0 * x_0 + theta_1 * ax_1 + theta_2 * ax_2

# 平面をワイヤーフレーム形式で描画
ax.plot_wireframe(ax_1, ax_2, y)

plt.show()

numpy.linalgパッケージ含めNumPyのリファレンスは以下。
http://docs.scipy.org/doc/numpy/reference/

SciPyで解く

「scipy.linalg.lstsq」を使用して最小二乗法で解く。

NumPyの「numpy.linalg」パッケージに＋α機能を付けたものが、SciPyの「scipy.linalg」パッケージになっている。重複する関数の使い方は同じ（たぶん）。
なお、「linalg」は「linear algebra」の省略で「線形代数」の意。

X = np.array([rooms, criminals]) # 説明変数
y = house_prices # 目的変数
n = X.shape[1] # 次数の取得
X = np.vstack([np.ones(n), X]) # 説明変数に定数項x_0を付ける
coef = LA.lstsq(X.T, y) # 演算部分
coef
# (array([-29.30168135,   8.3975317 ,  -0.2618229 ]),
#  19627.181583724781,
#  3,
#  array([ 219.52604791,  128.06307554,    2.38091887]))

scipy.linalg.lstsqの結果coefには下記が入っている。（NumPyのときと同じ）

coef[0]：最小二乗解p
coef[1]：残差の合計residues
coef[2]：係数行列thetaのランクrank
coef[3]：係数行列thetaのsingular value

下記で描画する。

fig = plt.figure() # figureオブジェクトを取得
ax = fig.gca(projection='3d') # 軸（ax）オブジェクトを3d指定で取得

# データセットの描画
ax.scatter3D(rooms, criminals, house_prices)
ax.set_xlabel("x_1:rooms")
ax.set_ylabel("x_2:criminals")
ax.set_zlabel("y:house_prices")

# SciPyで解く
X = np.array([rooms, criminals]) # 説明変数
y = house_prices # 目的変数
n = X.shape[1] # 次数の取得
X = np.vstack([np.ones(n), X]) # 説明変数に定数項を付ける
coef = LA.lstsq(X.T, y) # 演算部分
# 結果の取得
theta_0, theta_1, theta_2 = coef[0]

# プロットする各軸の点を作成
x_0 = 1
x_1 = np.arange(3, 10, 1)
x_2 = np.arange(-20, 100, 1)
ax_1, ax_2 = np.meshgrid(x_1, x_2)

# 重回帰分析の結果
y = theta_0 * x_0 + theta_1 * ax_1 + theta_2 * ax_2

# 平面をワイヤーフレーム形式で描画
ax.plot_wireframe(ax_1, ax_2, y)

plt.show()

scipy.linalgパッケージ含めSciPyのリファレンスは以下。
http://docs.scipy.org/doc/scipy/reference/

scikit-learnで解く

「sklearn.linear_model.LinearRegression」を使用して最小二乗法で解く。

boston_df = pd.DataFrame(boston.data)
boston_df.columns = boston.feature_names
X = pd.DataFrame() # 説明変数
X['RM'] = boston_df['RM']
X['CRIM'] = boston_df['CRIM']
y = house_prices # 目的変数
clf = linear_model.LinearRegression(fit_intercept=True, normalize=False, copy_X=True, n_jobs=1)
clf.fit(X, y) # 演算部分

clf.intercept_, clf.coef_
# (array([ 8.3975317, -0.2618229]), -29.30168134674113)

clf.coef_: 回帰係数
clf.intercept_: 切片（誤差）
clf.score(X, Y): 決定係数

描画は下記。

fig = plt.figure() # figureオブジェクトを取得
ax = fig.gca(projection='3d') # 軸（ax）オブジェクトを3d指定で取得

# データセットの描画
ax.scatter3D(rooms, criminals, house_prices)
ax.set_xlabel("x_1:rooms")
ax.set_ylabel("x_2:criminals")
ax.set_zlabel("y:house_prices")

# scikit-learnで解く
boston_df = pd.DataFrame(boston.data)
boston_df.columns = boston.feature_names
X = pd.DataFrame() # 説明変数
X['RM'] = boston_df['RM']
X['CRIM'] = boston_df['CRIM']
y = house_prices # 目的変数
clf = linear_model.LinearRegression(fit_intercept=True, normalize=False, copy_X=True, n_jobs=1)
clf.fit(X, y) # 演算部分
# 重回帰分析の結果
theta_0 = clf.intercept_
theta_1, theta_2 = clf.coef_

# プロットする各軸の点を作成
x_0 = 1
x_1 = np.arange(3, 10, 1)
x_2 = np.arange(-20, 100, 10)
ax_1, ax_2 = np.meshgrid(x_1, x_2)

# 重回帰分析の結果
y = theta_0 * x_0 + theta_1 * ax_1 + theta_2 * ax_2

# 平面をワイヤーフレーム形式で描画
ax.plot_wireframe(ax_1, ax_2, y)

plt.show()

ライブラリを使わず自力で解く

CourseraのMachine Learningに沿った方法で解く。仮説（Hypothesis）は下記。

${ \displaystyle h_\theta(x) = \theta_0 x_0 + \theta_1 x_1 + \theta_2 x_2 }$

ただし、 $x_0 = 1$ 。
上記に従って、データセットから $x$ 、 $y$ 、 $\theta$ を作成。

# x_0 = 1を設定
x_0 = np.ones(len(rooms)) # roomsはx_1なので、同じ数だけx_0 = 1を作る
x_1 = rooms
x_2 = criminals
X = np.column_stack((x_0, x_1, x_2)) # x_0とx_1の行列作成

y = house_prices.reshape(len(house_prices), 1) # 「reshape」で「1×n」→「n×1」変換

# θは「theta_0 = 0, theta_1 = 0, theta_2 = 0」で初期化
theta = np.matrix(np.array([0, 0, 0]))

X.shape, theta.shape, y.shape
# ((506, 3), (1, 3), (506, 1))

また、コスト関数の数式および実装は下記。

$J(\theta) = \frac{1}{2m} \sum_{i=1}^m (h(x^{(i)}) - y^{(i)})^{2}$

$m$ はデータセット数（ここでは、506）、 $i$ はデータセットの番号（何番目か、1～506）。

def computeCost(X, y, theta):
    inner = np.power(((X * theta.T) - y), 2)
    return np.sum(inner) / (2 * len(X))

勾配降下法の数式および実装は下記。

$\theta_j = \theta_j - \alpha \frac{\partial J}{\partial \theta_j}$

$\frac{\partial J}{\partial \theta_j} = \frac{1}{m} \sum_{i=1}^m (h(x^{(i)}) - y^{(i)}) x_j^{(i)}$

$j$ はパラメータの番号（何番目の $x_j$ や $\theta_j$ か、 $0 \leqq j \leqq 1$ 、パラメータ2個しかないから0番目か1番目）。

def gradientDescent(X, y, theta, alpha, iters):
    temp = np.matrix(np.zeros(theta.shape))
    parameters = int(theta.ravel().shape[1])
    cost = np.zeros(iters)

    for i in range(iters):
        # h(x) - y
        error = (X * theta.T) - y

        for j in range(parameters):
            # (h(x) - y) * x_j
            term = np.multiply(error, X[:,j])
            # - α 1/m Σ (h(x) - y) * x_j
            temp[0,j] = theta[0,j] - ((alpha / len(X)) * np.sum(term))

        theta = temp
        cost[i] = computeCost(X, y, theta)

    return theta, cost

求める。

# αを0.00001とおく
alpha = 0.00001
# 勾配降下法のループを1000とおく
iters = 1000

# 勾配降下法の実行
g, cost = gradientDescent(X, y, theta, alpha, iters)
g
# matrix([[ 0.42186987,  2.65129792,  1.51609938]])

最初、 $\alpha = 0.01$ でやっていたが発散してしまったので小さくした。
下記で描画する。

fig = plt.figure() # figureオブジェクトを取得
ax = fig.gca(projection='3d') # 軸（ax）オブジェクトを3d指定で取得

# データセットの描画
ax.scatter3D(rooms, criminals, house_prices)
ax.set_xlabel("x_1:rooms")
ax.set_ylabel("x_2:criminals")
ax.set_zlabel("y:house_prices")

# 結果の取得
theta_0 = g[0, 0]
theta_1 = g[0, 1]
theta_2 = g[0, 2]

# プロットする各軸の点を作成
x_0 = 1
x_1 = np.arange(3, 10, 1)
x_2 = np.arange(-20, 100, 10)
ax_1, ax_2 = np.meshgrid(x_1, x_2)

# 重回帰分析の結果
y = theta_0 * x_0 + theta_1 * ax_1 + theta_2 * ax_2

# 平面をワイヤーフレーム形式で描画
ax.plot_wireframe(ax_1, ax_2, y)


# scikit-learnで解いた結果を赤で描画
#(array([ 8.3975317, -0.2618229]), -29.30168134674113)
xx_0 = 1
xx_1 = np.arange(3, 10, 1)
xx_2 = np.arange(-20, 100, 10)
axx_1, axx_2 = np.meshgrid(xx_1, xx_2)
yy = -29.30168134674113 * xx_0 + 8.3975317 * axx_1 - 0.2618229 * axx_2
ax.plot_wireframe(axx_1, axx_2, yy, color='r')

plt.show()