In [2]:
# ライブラリをインポート

# データ操作と数値計算のため
import pandas as pd
import numpy as np

# データ可視化のため
import matplotlib.pyplot as plt
import seaborn as sns

# 統計モデリングと計量経済分析のため
import statsmodels.api as sm
from statsmodels.iolib.summary2 import summary_col

# Silence Python warnings so that the notebook output stays uncluttered.
# NOTE(review): the 'ignore' action is installed without a category filter, so it
# applies to every warning raised by later cells (deprecations included) —
# confirm that hiding all of them is intended.
import warnings
warnings.filterwarnings('ignore')

In [35]:
# Download the cross-country demography dataset.
url = "https://www.fbc.keio.ac.jp/~tyabu/keiryo/demography_data.csv"
df = pd.read_csv(url)

# Replace `region` by 0/1 integer indicator columns; the first category is
# dropped and therefore serves as the reference group.
df = pd.get_dummies(df, columns=['region'], drop_first=True, dtype=int)

# Names of the generated indicator columns, reused as regressors further down.
region_dummies = [name for name in df.columns if name.startswith('region_')]

# Derived variables, appended to the frame in the order listed below.
# `assign` evaluates the callables one after another, so `d_gdp` can refer to
# the two log-GDP columns created just before it.
df = df.assign(
    gdp2015=lambda d: np.log(d['rgdp2015'] / d['pop2015']),  # log GDP per capita, 2015
    gdp1990=lambda d: np.log(d['rgdp1990'] / d['pop1990']),  # log GDP per capita, 1990
    d_gdp=lambda d: d['gdp2015'] - d['gdp1990'],             # log difference 1990 -> 2015 (approx. growth)
    d_depend=lambda d: d['depend2015'] - d['depend1990'],    # change in `depend` between 1990 and 2015
    l_pop1990=lambda d: np.log(d['pop1990']),                # log population, 1990
)

df.head()

Unnamed: 0,country,pop1970,pop1990,rgdp1990,pop2015,rgdp2015,depend1990,depend2015,depend2015_ave,depend1990_ave,...,region_ECA,region_INL,region_LAC,region_MNA,region_SAS,gdp2015,gdp1990,d_gdp,d_depend,l_pop1990
0,Albania,2.150602,3.281453,13526.918,2.889676,28794.26,0.371767,0.765219,46.603516,40.800007,...,1,0,0,0,0,9.206787,8.324151,0.882636,0.393452,1.188286
1,Algeria,14.550033,25.912364,268930.19,38.934334,534905.63,0.283696,0.367006,41.229862,38.387451,...,0,0,0,1,0,9.527969,9.247487,0.280482,0.08331,3.25472
2,Angola,6.300969,11.12787,65542.75,24.227524,206743.03,0.265024,0.246253,37.734669,38.132797,...,0,0,0,0,0,9.051743,8.681005,0.370737,-0.018771,2.409453
3,Antigua and Barbuda,0.065369,0.061906,1053.9128,0.0909,1838.9391,0.379396,0.492705,43.998329,41.195862,...,0,0,1,0,0,9.914939,9.742403,0.172536,0.113308,-2.782138
4,Argentina,23.973062,32.72974,348658.31,42.980026,868209.31,0.562326,0.590606,45.300045,44.644901,...,0,0,1,0,0,9.913453,9.273564,0.639889,0.02828,3.488284


In [14]:
# Print the frame's index range, column count, dtypes and memory usage.
# NOTE(review): the saved output below reports 183 columns with 174 int32 dummies
# and no derived variables; it appears to come from an earlier run that also
# dummy-encoded `country`. Re-run after Restart & Run All to refresh it.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 169 entries, 0 to 168
Columns: 183 entries, pop1970 to region_SAS
dtypes: float64(9), int32(174)
memory usage: 126.9 KB


In [19]:
# Show descriptive statistics (count, mean, std, min, quartiles, max) per numeric column.
# NOTE(review): the saved output below contains `country_*` dummy columns that the
# current loading cell does not create — presumably stale; re-run to refresh it.
df.describe()

Unnamed: 0,pop1970,pop1990,rgdp1990,pop2015,rgdp2015,depend1990,depend2015,depend2015_ave,depend1990_ave,country_Algeria,...,region_ECA,region_INL,region_LAC,region_MNA,region_SAS,gdp2015,gdp1990,d_gdp,d_depend,l_pop1990
count,145.0,169.0,169.0,169.0,169.0,169.0,169.0,169.0,169.0,169.0,...,169.0,169.0,169.0,169.0,169.0,169.0,169.0,169.0,169.0,169.0
mean,22.907531,30.728048,279514.9,41.895413,606233.8,0.409903,0.521218,43.096468,41.239505,0.005917,...,0.171598,0.153846,0.171598,0.106509,0.04142,9.260577,8.802822,0.457755,0.111315,1.762797
std,83.708112,114.243448,875166.1,149.076763,1988162.0,0.168005,0.278106,4.612848,3.27757,0.076923,...,0.378151,0.361873,0.378151,0.309405,0.199852,1.196047,1.211614,0.443817,0.151215,1.849155
min,0.052364,0.061906,275.1715,0.0909,632.0312,0.1041,0.11494,35.874977,36.199837,0.0,...,0.0,0.0,0.0,0.0,0.0,6.345704,5.89797,-0.574202,-0.18467,-2.782138
25%,1.03224,2.101156,8162.877,2.916798,25577.48,0.289938,0.283937,38.819458,38.811405,0.0,...,0.0,0.0,0.0,0.0,0.0,8.448749,7.8201,0.179731,-0.011646,0.742488
50%,4.505774,6.67392,33508.0,9.500422,76231.6,0.336027,0.42646,42.244106,39.821709,0.0,...,0.0,0.0,0.0,0.0,0.0,9.405314,8.943478,0.390405,0.081065,1.898207
75%,12.90476,17.478455,188970.7,29.469913,379616.8,0.520279,0.762153,47.555046,44.215298,0.0,...,0.0,0.0,0.0,0.0,0.0,10.100566,9.703582,0.658558,0.223096,2.860969
max,808.510713,1154.605773,9241600.0,1369.43567,17150540.0,0.810809,1.21674,53.256733,48.726349,1.0,...,1.0,1.0,1.0,1.0,1.0,12.00331,11.702765,2.994699,0.518942,7.051514


# 説明変数に人口比を用いた推定

In [23]:
# Specification 1: regress d_gdp on d_depend and an intercept.
endog = df['d_gdp']
exog = sm.add_constant(df['d_depend'])  # adds the `const` column

mod_1 = sm.OLS(endog, exog)
res_1 = mod_1.fit()
print(res_1.summary())

                            OLS Regression Results                            
Dep. Variable:                  d_gdp   R-squared:                       0.013
Model:                            OLS   Adj. R-squared:                  0.007
Method:                 Least Squares   F-statistic:                     2.211
Date:                Sat, 20 Jul 2024   Prob (F-statistic):              0.139
Time:                        18:12:42   Log-Likelihood:                -100.90
No. Observations:                 169   AIC:                             205.8
Df Residuals:                     167   BIC:                             212.1
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.4204      0.042      9.942      0.0

In [24]:
# Specification 2: same as the baseline, with the 1990 log GDP per capita
# (`gdp1990`) added as a control variable.
endog = df['d_gdp']
exog = sm.add_constant(df[['d_depend', 'gdp1990']])  # adds the `const` column

mod_2 = sm.OLS(endog, exog)
res_2 = mod_2.fit()
print(res_2.summary())

                            OLS Regression Results                            
Dep. Variable:                  d_gdp   R-squared:                       0.132
Model:                            OLS   Adj. R-squared:                  0.121
Method:                 Least Squares   F-statistic:                     12.58
Date:                Sat, 20 Jul 2024   Prob (F-statistic):           8.15e-06
Time:                        18:13:23   Log-Likelihood:                -90.084
No. Observations:                 169   AIC:                             186.2
Df Residuals:                     166   BIC:                             195.6
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          1.6933      0.270      6.265      0.0

In [34]:
# Specification 3: additionally control for 1990 log population, the 1990 level
# of `depend`, and the region indicator columns.
endog = df['d_gdp']
exog = sm.add_constant(
    df[['d_depend', 'gdp1990', 'l_pop1990', 'depend1990', *region_dummies]]
)

mod_3 = sm.OLS(endog, exog)
res_3 = mod_3.fit()
print(res_3.summary())

                            OLS Regression Results                            
Dep. Variable:                  d_gdp   R-squared:                       0.274
Model:                            OLS   Adj. R-squared:                  0.228
Method:                 Least Squares   F-statistic:                     5.959
Date:                Sat, 20 Jul 2024   Prob (F-statistic):           1.24e-07
Time:                        18:26:18   Log-Likelihood:                -74.973
No. Observations:                 169   AIC:                             171.9
Df Residuals:                     158   BIC:                             206.4
Df Model:                          10                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          1.5944      0.331      4.816      0.0

# 練習問題
説明変数に平均年齢を用いた推定

In [36]:
# Change in the `*_ave` measure between 1990 and 2015, used as the regressor of
# interest in the exercise below.
df['d_depend2'] = df['depend2015_ave'].sub(df['depend1990_ave'])

In [37]:
# Specification 4: regress d_gdp on d_depend2 and an intercept.
endog = df['d_gdp']
exog = sm.add_constant(df['d_depend2'])  # adds the `const` column

mod_4 = sm.OLS(endog, exog)
res_4 = mod_4.fit()
print(res_4.summary())

                            OLS Regression Results                            
Dep. Variable:                  d_gdp   R-squared:                       0.043
Model:                            OLS   Adj. R-squared:                  0.037
Method:                 Least Squares   F-statistic:                     7.432
Date:                Sat, 20 Jul 2024   Prob (F-statistic):            0.00709
Time:                        18:30:06   Log-Likelihood:                -98.334
No. Observations:                 169   AIC:                             200.7
Df Residuals:                     167   BIC:                             206.9
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.3805      0.044      8.669      0.0

In [38]:
# Specification 5: d_depend2 together with 1990 log GDP per capita, 1990 log
# population, the 1990 level of `depend1990_ave`, and the region indicators.
endog = df['d_gdp']
exog = sm.add_constant(
    df[['d_depend2', 'gdp1990', 'l_pop1990', 'depend1990_ave', *region_dummies]]
)

mod_5 = sm.OLS(endog, exog)
res_5 = mod_5.fit()
print(res_5.summary())

                            OLS Regression Results                            
Dep. Variable:                  d_gdp   R-squared:                       0.293
Model:                            OLS   Adj. R-squared:                  0.249
Method:                 Least Squares   F-statistic:                     6.561
Date:                Sat, 20 Jul 2024   Prob (F-statistic):           1.87e-08
Time:                        18:31:20   Log-Likelihood:                -72.666
No. Observations:                 169   AIC:                             167.3
Df Residuals:                     158   BIC:                             201.8
Df Model:                          10                                         
Covariance Type:            nonrobust                                         
                     coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------
const              1.3910      0.722      1.