mayo 30, 2017

Regresión múltiple


Encontrarás explicación sobre selección de modelo en regresión con más de dos variables independientes en páginas 226 – 231, Experimentación en Agricultura.


plot(todos, scale = "Cp"   , main = "Cp de Mallows")



El Cp con el menor valor y el R2-ajustado con el mayor, sirven para seleccionar el modelo con las variables más apropiadas.



Códigos para el software R

#***********************************************************
# EXPERIMENTACIÓN EN AGRICULTURA                              
# Fernández Escobar, R.; Trapero, A.; Domínguez, J. 2010       
# CAPÍTULO 16: REGRESIÓN MÚLTIPLE
# REGRESIÓN CON MÁS DE DOS VARIABLES INDEPENDIENTES (pp. 226-231)
#************************************************************

#### Importar Tabla 16.2
SUELO <- read.csv("C:/Users/Administrator/Desktop/tabla 16.2.csv", header=TRUE)
attach(SUELO); knitr::kable(SUELO)
N_MINERALTRATGROSORPROF
26.3400.520
28.2500.530
26.2400.550
37.2800.570
42.0400.590
21.1501.020
34.3801.030
26.6701.050
28.3901.070
40.5001.090
35.5302.020
22.3002.030
20.7502.050
20.3502.070
47.3202.090
44.7510.520
74.1610.530
62.1110.550
57.4110.570
60.7810.590
41.5411.020
61.1011.030
63.6711.050
62.3111.070
65.2111.090
51.8012.020
74.0412.030
60.6912.050
56.5212.070
69.2312.090
#### El primer procedimiento: Best Subset Regression Models for N_MINERAL
library(leaps)
library(HH)
todos <- regsubsets(N_MINERAL ~ PROF + TRAT + GROSOR, data = SUELO, nbest=10)


# Posibles modelos representados por asteriscos
summary(todos); stars <- summary(todos); as.data.frame(stars$outmat) 
## Subset selection object
## Call: regsubsets.formula(N_MINERAL ~ PROF + TRAT + GROSOR, data = SUELO, 
##     nbest = 10)
## 3 Variables  (and intercept)
##        Forced in Forced out
## PROF       FALSE      FALSE
## TRAT       FALSE      FALSE
## GROSOR     FALSE      FALSE
## 10 subsets of each size up to 3
## Selection Algorithm: exhaustive
##          PROF TRAT GROSOR
## 1  ( 1 ) " "  "*"  " "   
## 1  ( 2 ) "*"  " "  " "   
## 1  ( 3 ) " "  " "  "*"   
## 2  ( 1 ) "*"  "*"  " "   
## 2  ( 2 ) " "  "*"  "*"   
## 2  ( 3 ) "*"  " "  "*"   
## 3  ( 1 ) "*"  "*"  "*"
##          PROF TRAT GROSOR
## 1  ( 1 )         *       
## 1  ( 2 )    *            
## 1  ( 3 )                *
## 2  ( 1 )    *    *       
## 2  ( 2 )         *      *
## 2  ( 3 )    *           *
## 3  ( 1 )    *    *      *
# Opción 1
knitr::kable(summaryHH(todos))
modelprsqrssadjr2cpbicstderr
T20.75173222207.8130.74286556.825148-34.9950228.879779
P20.05159268434.0640.017720999.3952895.21326217.355592
G20.00002838892.618-0.0356849106.2129406.80154417.821154
P-T30.80332481749.0070.78875622.003748-38.5824528.048481
T-G30.75176052207.5610.73337248.821400-31.5972519.042208
P-G30.05162098433.812-0.0186294101.3915418.61356317.673806
P-T-G40.80335311748.7550.78066314.000000-35.1855798.201208
# Opción 2, ordenados según página 228. No incluye el intercepto
knitr::kable(data.frame(summary(todos)$cp, summary(todos)$adjr2, summary(todos)$rsq, summary(todos)$rss))
summary.todos..cpsummary.todos..adjr2summary.todos..rsqsummary.todos..rss
6.8251480.74286550.75173222207.813
99.3952890.01772090.05159268434.064
106.212940-0.03568490.00002838892.618
2.0037480.78875620.80332481749.007
8.8214000.73337240.75176052207.561
101.391541-0.01862940.05162098433.812
4.0000000.78066310.80335311748.755
# Gráficos de diagnósticos de "Best Subset Regression Models for N_MINERAL"
plot(todos, scale = "r2"   , main = "Coeficiente de Determinación R^2")
plot(todos, scale = "Cp"   , main = "Cp de Mallows")
plot(todos, scale = "adjr2", main = "R^2 ajustado")
plot(todos, scale = "bic"  , main = "Criterio de información bayesiano (BIC)")
#### El segundo procedimiento
# Selección de "mejor" modelo, usando procedimientos "Forward" y "Backward"
# Opción 1
# Backward
modelo.completo <- lm(N_MINERAL ~ PROF + TRAT + GROSOR, data = SUELO) # regresión con todas las variables en SUELO
modelo.reducido <- step(modelo.completo, direction = "backward")
## Start:  AIC=129.96
## N_MINERAL ~ PROF + TRAT + GROSOR
## 
##          Df Sum of Sq    RSS    AIC
## - GROSOR  1       0.3 1749.0 127.97
## <none>                1748.8 129.96
## - PROF    1     458.8 2207.6 134.95
## - TRAT    1    6685.1 8433.8 175.16
## 
## Step:  AIC=127.97
## N_MINERAL ~ PROF + TRAT
## 
##        Df Sum of Sq    RSS    AIC
## <none>              1749.0 127.97
## - PROF  1     458.8 2207.8 132.96
## - TRAT  1    6685.1 8434.1 173.16
summary(modelo.reducido); modelo.reducido$anova
## 
## Call:
## lm(formula = N_MINERAL ~ PROF + TRAT, data = SUELO)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -13.9287  -4.8533  -0.0762   4.0864  17.1644 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 22.55961    3.63579   6.205 1.24e-06 ***
## PROF         0.15269    0.05737   2.661   0.0129 *  
## TRAT        29.85533    2.93889  10.159 1.01e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 8.048 on 27 degrees of freedom
## Multiple R-squared:  0.8033, Adjusted R-squared:  0.7888 
## F-statistic: 55.14 on 2 and 27 DF,  p-value: 2.922e-10
##       Step Df Deviance Resid. Df Resid. Dev      AIC
## 1          NA       NA        26   1748.755 129.9639
## 2 - GROSOR  1 0.252105        27   1749.007 127.9682
# Forward
modelo.fwd <- step(lm(N_MINERAL ~1), direction = "forward", scope = (~PROF + TRAT + GROSOR))
## Start:  AIC=172.75
## N_MINERAL ~ 1
## 
##          Df Sum of Sq    RSS    AIC
## + TRAT    1    6685.1 2207.8 132.96
## <none>                8892.9 172.75
## + PROF    1     458.8 8434.1 173.16
## + GROSOR  1       0.3 8892.6 174.75
## 
## Step:  AIC=132.96
## N_MINERAL ~ TRAT
## 
##          Df Sum of Sq    RSS    AIC
## + PROF    1    458.81 1749.0 127.97
## <none>                2207.8 132.96
## + GROSOR  1      0.25 2207.6 134.95
## 
## Step:  AIC=127.97
## N_MINERAL ~ TRAT + PROF
## 
##          Df Sum of Sq    RSS    AIC
## <none>                1749.0 127.97
## + GROSOR  1   0.25211 1748.8 129.96
summary(modelo.fwd); modelo.fwd$anova
## 
## Call:
## lm(formula = N_MINERAL ~ TRAT + PROF)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -13.9287  -4.8533  -0.0762   4.0864  17.1644 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 22.55961    3.63579   6.205 1.24e-06 ***
## TRAT        29.85533    2.93889  10.159 1.01e-10 ***
## PROF         0.15269    0.05737   2.661   0.0129 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 8.048 on 27 degrees of freedom
## Multiple R-squared:  0.8033, Adjusted R-squared:  0.7888 
## F-statistic: 55.14 on 2 and 27 DF,  p-value: 2.922e-10
##     Step Df  Deviance Resid. Df Resid. Dev      AIC
## 1        NA        NA        29   8892.870 172.7542
## 2 + TRAT -1 6685.0570        28   2207.813 132.9568
## 3 + PROF -1  458.8061        27   1749.007 127.9682
# Opción 2
# Usando Cp en regresión Stepwise
modelo.completo <- lm(N_MINERAL ~., data=SUELO)
MSE=(summary(modelo.completo)$sigma)^2 # guardar MSE para el modelo completo
intercepto <- lm(N_MINERAL~1) # regresión incluye solo la constante del intercepto
modelos <- step(intercepto, scope = list(upper = modelo.completo), scale = MSE); modelos
## Start:  AIC=104.22
## N_MINERAL ~ 1
## 
##          Df Sum of Sq    RSS       Cp
## + TRAT    1    6685.1 2207.8   6.8251
## + PROF    1     458.8 8434.1  99.3953
## <none>                8892.9 104.2167
## + GROSOR  1       0.3 8892.6 106.2129
## 
## Step:  AIC=6.83
## N_MINERAL ~ TRAT
## 
##          Df Sum of Sq    RSS       Cp
## + PROF    1     458.8 1749.0   2.0037
## <none>                2207.8   6.8251
## + GROSOR  1       0.3 2207.6   8.8214
## - TRAT    1    6685.1 8892.9 104.2167
## 
## Step:  AIC=2
## N_MINERAL ~ TRAT + PROF
## 
##          Df Sum of Sq    RSS      Cp
## <none>                1749.0  2.0037
## + GROSOR  1       0.3 1748.8  4.0000
## - PROF    1     458.8 2207.8  6.8251
## - TRAT    1    6685.1 8434.1 99.3953
## 
## Call:
## lm(formula = N_MINERAL ~ TRAT + PROF)
## 
## Coefficients:
## (Intercept)         TRAT         PROF  
##     22.5596      29.8553       0.1527
summary(modelos); modelos$anova
## 
## Call:
## lm(formula = N_MINERAL ~ TRAT + PROF)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -13.9287  -4.8533  -0.0762   4.0864  17.1644 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 22.55961    3.63579   6.205 1.24e-06 ***
## TRAT        29.85533    2.93889  10.159 1.01e-10 ***
## PROF         0.15269    0.05737   2.661   0.0129 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 8.048 on 27 degrees of freedom
## Multiple R-squared:  0.8033, Adjusted R-squared:  0.7888 
## F-statistic: 55.14 on 2 and 27 DF,  p-value: 2.922e-10
##     Step Df  Deviance Resid. Df Resid. Dev         Cp
## 1        NA        NA        29   8892.870 104.216688
## 2 + TRAT -1 6685.0570        28   2207.813   6.825148
## 3 + PROF -1  458.8061        27   1749.007   2.003748
#### Modelo final
modelo.final <- lm(N_MINERAL ~ PROF + TRAT, data = SUELO)
anova(modelo.final); summary(modelo.final)
## Analysis of Variance Table
## 
## Response: N_MINERAL
##           Df Sum Sq Mean Sq  F value    Pr(>F)    
## PROF       1  458.8   458.8   7.0827   0.01294 *  
## TRAT       1 6685.1  6685.1 103.1994 1.012e-10 ***
## Residuals 27 1749.0    64.8                       
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Call:
## lm(formula = N_MINERAL ~ PROF + TRAT, data = SUELO)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -13.9287  -4.8533  -0.0762   4.0864  17.1644 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 22.55961    3.63579   6.205 1.24e-06 ***
## PROF         0.15269    0.05737   2.661   0.0129 *  
## TRAT        29.85533    2.93889  10.159 1.01e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 8.048 on 27 degrees of freedom
## Multiple R-squared:  0.8033, Adjusted R-squared:  0.7888 
## F-statistic: 55.14 on 2 and 27 DF,  p-value: 2.922e-10
#  Diagnósticos de modelo final
library(broom)
tidy(modelo.final); glance(modelo.final)
##          term  estimate  std.error statistic      p.value
## 1 (Intercept) 22.559610 3.63578542  6.204879 1.235274e-06
## 2        PROF  0.152687 0.05737219  2.661342 1.294477e-02
## 3        TRAT 29.855333 2.93888984 10.158711 1.011703e-10
##   r.squared adj.r.squared    sigma statistic      p.value df    logLik
## 1 0.8033248     0.7887562 8.048481  55.14108 2.921599e-10  3 -103.5523
##        AIC      BIC deviance df.residual
## 1 215.1045 220.7093 1749.007          27
knitr::kable(augment(modelo.final))
N_MINERALPROFTRAT.fitted.se.fit.resid.hat.sigma.cooksd.std.resid
26.3420025.613352.7729230.72665040.11869928.2003950.00041520.0961722
28.2530027.140222.4313901.10978050.09126028.1986210.00070040.1446450
26.2450030.193962.081274-3.95395930.06686998.1624220.0061782-0.5085658
37.2870033.247702.3205614.03230080.08313018.1601130.00827370.5232209
42.0490036.301443.0119035.73856100.14004078.1115150.03208890.7688650
21.1520025.613352.772923-4.46334960.11869928.1486260.0156665-0.5907244
34.3830027.140222.4313907.23978050.09126028.0654280.02980600.9436081
26.6750030.193962.081274-3.52395930.06686998.1705360.0049075-0.4532584
28.3970033.247702.320561-4.85769920.08313018.1412310.0120076-0.6303225
40.5090036.301443.0119034.19856100.14004078.1535950.01717710.5625324
35.5320025.613352.7729239.91665040.11869927.9358540.07733571.3124688
22.3030027.140222.431390-4.84021950.09126028.1411280.0133224-0.6308576
20.7550030.193962.081274-9.44395930.06686997.9745450.0352456-1.2147001
20.3570033.247702.320561-12.89769920.08313017.7647480.0846483-1.6735722
47.3290036.301443.01190311.01856100.14004077.8638110.11830391.4762910
44.7520155.468682.772923-10.71868290.11869927.8902160.0903510-1.4186178
74.1630156.995552.43139017.16444720.09126027.4027080.16753772.2371551
62.1150160.049292.0812742.06070730.06686998.1911220.00167810.2650521
57.4170163.103032.320561-5.69303250.08313018.1184930.0164923-0.7387132
60.7890166.156773.011903-5.37677240.14004078.1225940.0281704-0.7203918
41.5420155.468682.772923-13.92868290.11869927.6682890.1525704-1.8434613
61.1030156.995552.4313904.10444720.09126028.1582170.00957990.5349596
63.6750160.049292.0812743.62070730.06686998.1687920.00518060.4657023
62.3170163.103032.320561-0.79303250.08313018.2001910.0003200-0.1029019
65.2190166.156773.011903-0.94677240.14004078.1993550.0008735-0.1268506
51.8020155.468682.772923-3.66868290.11869928.1659130.0105845-0.4855502
74.0430156.995552.43139017.04444720.09126027.4144340.16520332.2215147
60.6950160.049292.0812740.64070730.06686998.2007680.00016220.0824090
56.5270163.103032.320561-6.58303250.08313018.0902170.0220519-0.8541973
69.2390166.156773.0119033.07322760.14004078.1760080.00920320.4117578
# Intervalo de confianza (95%) y predicción
intervalo  <- predict(modelo.final, interval = "confidence")
prediccion <- predict(modelo.final, interval = "prediction")
knitr::kable(data.frame(N_MINERAL, intervalo, prediccion))
N_MINERALfitlwruprfit.1lwr.1upr.1
26.3425.6133519.9237831.3029225.613358.14660043.08010
28.2527.1402222.1514232.1290227.140229.88900944.39143
26.2430.1939625.9235434.4643830.1939613.13662547.25129
37.2833.2477028.4863038.0091033.2477016.06087150.43453
42.0436.3014430.1215242.4813536.3014418.66887053.93401
21.1525.6133519.9237831.3029225.613358.14660043.08010
34.3827.1402222.1514232.1290227.140229.88900944.39143
26.6730.1939625.9235434.4643830.1939613.13662547.25129
28.3933.2477028.4863038.0091033.2477016.06087150.43453
40.5036.3014430.1215242.4813536.3014418.66887053.93401
35.5325.6133519.9237831.3029225.613358.14660043.08010
22.3027.1402222.1514232.1290227.140229.88900944.39143
20.7530.1939625.9235434.4643830.1939613.13662547.25129
20.3533.2477028.4863038.0091033.2477016.06087150.43453
47.3236.3014430.1215242.4813536.3014418.66887053.93401
44.7555.4686849.7791161.1582555.4686838.00193472.93543
74.1656.9955552.0067561.9843556.9955539.74434274.24676
62.1160.0492955.7788764.3197260.0492942.99195977.10663
57.4163.1030358.3416367.8644363.1030345.91620480.28986
60.7866.1567759.9768672.3366966.1567748.52420383.78934
41.5455.4686849.7791161.1582555.4686838.00193472.93543
61.1056.9955552.0067561.9843556.9955539.74434274.24676
63.6760.0492955.7788764.3197260.0492942.99195977.10663
62.3163.1030358.3416367.8644363.1030345.91620480.28986
65.2166.1567759.9768672.3366966.1567748.52420383.78934
51.8055.4686849.7791161.1582555.4686838.00193472.93543
74.0456.9955552.0067561.9843556.9955539.74434274.24676
60.6960.0492955.7788764.3197260.0492942.99195977.10663
56.5263.1030358.3416367.8644363.1030345.91620480.28986
69.2366.1567759.9768672.3366966.1567748.52420383.78934
# Gráficos de diagnósticos
par(mfrow = c(1, 1))
plot(modelo.final, 1) # Residuos vs. Estimados
plot(modelo.final, 2) # Normal Q-Q
plot(modelo.final, 3) # Residuos estandarizados vs Estimados
plot(modelo.final, 4) # Cook's distance
plot(modelo.final, 5) # Residuos vs. leverage
plot(modelo.final, 6) # Cook's distance vs. leverage



Códigos para el software SAS


/***********************************************************
 EXPERIMENTACIÓN EN AGRICULTURA                            
 Fernández Escobar, R.; Trapero, A.; Domínguez, J. 2010     
 CAPÍTULO 16: REGRESIÓN MÚLTIPLE
 REGRESIÓN CON MÁS DE DOS VARIABLES INDEPENDIENTES (pp. 226-231)
************************************************************/

DATA SUELO;
INPUT N_MINERAL TRAT GROSOR PROF;

LABEL
N_MINERAL= "mg N/kg suelo"
TRAT     = "kg N/árbol"
GROSOR   = "mm"
PROF     = "cm";

CARDS;
26.34     0    0.5  20
28.25     0    0.5  30
26.24     0    0.5  50
37.28     0    0.5  70
42.04     0    0.5  90
21.15     0    1    20
34.38     0    1    30
26.67     0    1    50
28.39     0    1    70
40.5      0    1    90
35.53     0    2    20
22.3      0    2    30
20.75     0    2    50
20.35     0    2    70
47.32     0    2    90
44.75     1    0.5  20
74.16     1    0.5  30
62.11     1    0.5  50
57.41     1    0.5  70
60.78     1    0.5  90
41.54     1    1    20
61.1      1    1    30
63.67     1    1    50
62.31     1    1    70
65.21     1    1    90
51.8      1    2    20
74.04     1    2    30
60.69     1    2    50
56.52     1    2    70
69.23     1    2    90
;

ODS HTML;
* Mostrar datos a analizar;
PROC PRINT DATA = SUELO;
RUN;

* El primer procedimiento: Best Subset Regression Models;
PROC REG;
     MODEL N_MINERAL = PROF TRAT GROSOR  / SELECTION = CP ADJRSQ RSQUARE PRESS MSE SSE SBC SP PC B BIC AIC JP;
RUN;

* El segundo procedimiento: Stepwise Linear Regression;
PROC REG;
     MODEL N_MINERAL = PROF TRAT GROSOR  / SELECTION = STEPWISE;
RUN;

* Regresión múltiple;
PROC REG;
     MODEL N_MINERAL = PROF TRAT / R P CLM CLI influence;
RUN;

* Correlación;
PROC CORR PEARSON;
     VAR N_MINERAL PROF TRAT GROSOR; *Valor de Correlación Múltiple es tan grande como el mayor de los coeficientes simples;
RUN;

* Correlación Parcial;
PROC CORR;
     VAR GROSOR;
     WITH N_MINERAL;
     PARTIAL PROF TRAT;
RUN;
QUIT;
ODS HTML CLOSE;

------------------------------------------------------------




No hay comentarios:

Publicar un comentario