project2


library(tidyr)
library(ggplot2)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tibble)
library(MASS)
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
library(lmtest)
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
library(plotROC) 
library(boot)
library(glmnet)
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
## Loaded glmnet 3.0-1
library(sandwich)

##Introduction

My dataset includes patient information on physician office visits from the package Ecdat. The variables in this dataset are "ofp" number of physician office visits, "ofnp" number of nonphysician office visits, "opp" number of physician outpatient visits, "opnp" number of nonphysician outpatient visits, "emr" number of emergency room visits, "hosp" number of hospitalizations, "numchron" number of chronic conditions, binary "adldiff" whether the person has a condition that limits activities of daily living, "age" age in years (divided by 10), binary "black" whether the person is African-American, "sex" whether the person is male or female, binary "married" whether the person is married, "school" number of years of education, "faminc" family income in $10,000, binary "employed" whether the person is employed, binary "privins" whether the person is covered by private health insurance, binary "medicaid" whether the person is covered by Medicaid, "region" the region (northeast, midwest, west, other), and "hlth" self-perceived health (excellent, poor, other).

The purpose of this project is to determine which variables, with and without interactions, account for whether an individual has a condition that limits activities of daily living.

visits <- read.csv("OFP.csv")
head(visits)
##   X ofp ofnp opp opnp emr hosp numchron adldiff age black    sex maried
## 1 1   5    0   0    0   0    1        2       0 6.9   yes   male    yes
## 2 2   1    0   2    0   2    0        2       0 7.4    no female    yes
## 3 3  13    0   0    0   3    3        4       1 6.6   yes female     no
## 4 4  16    0   5    0   1    1        2       1 7.6    no   male    yes
## 5 5   3    0   0    0   0    0        2       1 7.9    no female    yes
## 6 6  17    0   0    0   0    0        5       1 6.6    no female     no
##   school faminc employed privins medicaid region  hlth
## 1      6 2.8810      yes     yes       no  other other
## 2     10 2.7478       no     yes       no  other other
## 3     10 0.6532       no      no      yes  other  poor
## 4      3 0.6588       no     yes       no  other  poor
## 5      6 0.6588       no     yes       no  other other
## 6      7 0.3301       no      no      yes  other  poor

##MANOVA

#Normality
ggplot(visits, aes(x = ofp, y = numchron)) +
 geom_point(alpha = .5) + geom_density_2d(h=2) + coord_fixed() + facet_wrap(~region)

#Homogeneity
covmats<-visits%>%group_by(region)%>%do(covs=cov(.[2:3]))
for(i in 1:3){print(covmats$covs[i])} #prints the ofp/ofnp covariance matrices for the first three regions
## [[1]]
##            ofp      ofnp
## ofp  40.582725  6.319781
## ofnp  6.319781 37.474543
## 
## [[1]]
##           ofp     ofnp
## ofp  54.43531 16.08302
## ofnp 16.08302 31.62989
## 
## [[1]]
##            ofp      ofnp
## ofp  43.491422  5.260384
## ofnp  5.260384 16.457533
#MANOVA
man1<-manova(cbind(ofp,opp,emr,hosp,numchron,age,school,faminc)~region, data=visits)
summary(man1)
##             Df   Pillai approx F num Df den Df    Pr(>F)    
## region       3 0.044757    8.324     24  13191 < 2.2e-16 ***
## Residuals 4402                                              
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#Univariate ANOVA
summary.aov(man1)
##  Response ofp :
##               Df Sum Sq Mean Sq F value   Pr(>F)   
## region         3    583 194.286   4.262 0.005164 **
## Residuals   4402 200669  45.586                    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##  Response opp :
##               Df Sum Sq Mean Sq F value Pr(>F)
## region         3      3   1.136  0.0851 0.9682
## Residuals   4402  58771  13.351               
## 
##  Response emr :
##               Df  Sum Sq Mean Sq F value Pr(>F)
## region         3    1.92 0.63973  1.2923 0.2752
## Residuals   4402 2179.15 0.49504               
## 
##  Response hosp :
##               Df  Sum Sq Mean Sq F value Pr(>F)
## region         3    0.78 0.26003  0.4666 0.7056
## Residuals   4402 2453.29 0.55731               
## 
##  Response numchron :
##               Df Sum Sq Mean Sq F value   Pr(>F)   
## region         3   26.3  8.7558  4.8194 0.002368 **
## Residuals   4402 7997.5  1.8168                    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##  Response age :
##               Df  Sum Sq Mean Sq F value Pr(>F)
## region         3    0.19 0.06341  0.1579 0.9246
## Residuals   4402 1767.10 0.40143               
## 
##  Response school :
##               Df Sum Sq Mean Sq F value    Pr(>F)    
## region         3   1941  647.07  47.766 < 2.2e-16 ***
## Residuals   4402  59633   13.55                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##  Response faminc :
##               Df Sum Sq Mean Sq F value    Pr(>F)    
## region         3    507 169.017  20.016 6.989e-13 ***
## Residuals   4402  37171   8.444                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#t-tests
pairwise.t.test(visits$ofp,visits$region,
 p.adj="none")
## 
##  Pairwise comparisons using t tests with pooled SD 
## 
## data:  visits$ofp and visits$region 
## 
##         midwest noreast other 
## noreast 0.0258  -       -     
## other   0.5152  0.0740  -     
## west    0.0020  0.4080  0.0069
## 
## P value adjustment method: none
pairwise.t.test(visits$opp,visits$region,
 p.adj="none")
## 
##  Pairwise comparisons using t tests with pooled SD 
## 
## data:  visits$opp and visits$region 
## 
##         midwest noreast other
## noreast 0.62    -       -    
## other   0.76    0.80    -    
## west    0.87    0.76    0.92 
## 
## P value adjustment method: none
pairwise.t.test(visits$emr,visits$region,
 p.adj="none")
## 
##  Pairwise comparisons using t tests with pooled SD 
## 
## data:  visits$emr and visits$region 
## 
##         midwest noreast other
## noreast 0.647   -       -    
## other   0.076   0.264   -    
## west    0.171   0.393   0.903
## 
## P value adjustment method: none
pairwise.t.test(visits$hosp,visits$region,
 p.adj="none")
## 
##  Pairwise comparisons using t tests with pooled SD 
## 
## data:  visits$hosp and visits$region 
## 
##         midwest noreast other
## noreast 0.37    -       -    
## other   0.87    0.27    -    
## west    0.87    0.33    0.98 
## 
## P value adjustment method: none
pairwise.t.test(visits$numchron,visits$region,
 p.adj="none")
## 
##  Pairwise comparisons using t tests with pooled SD 
## 
## data:  visits$numchron and visits$region 
## 
##         midwest noreast other  
## noreast 0.64740 -       -      
## other   0.00053 0.00814 -      
## west    0.36993 0.67854 0.03307
## 
## P value adjustment method: none
pairwise.t.test(visits$age,visits$region,
 p.adj="none")
## 
##  Pairwise comparisons using t tests with pooled SD 
## 
## data:  visits$age and visits$region 
## 
##         midwest noreast other
## noreast 0.62    -       -    
## other   0.71    0.85    -    
## west    0.89    0.56    0.64 
## 
## P value adjustment method: none
pairwise.t.test(visits$school,visits$region,
 p.adj="none")
## 
##  Pairwise comparisons using t tests with pooled SD 
## 
## data:  visits$school and visits$region 
## 
##         midwest noreast other  
## noreast 0.65    -       -      
## other   7.2e-09 2.1e-06 -      
## west    3.0e-10 3.5e-10 < 2e-16
## 
## P value adjustment method: none
pairwise.t.test(visits$faminc,visits$region,
 p.adj="none")
## 
##  Pairwise comparisons using t tests with pooled SD 
## 
## data:  visits$faminc and visits$region 
## 
##         midwest noreast other  
## noreast 0.2079  -       -      
## other   0.0019  3.4e-05 -      
## west    6.7e-06 0.0024  5.0e-14
## 
## P value adjustment method: none
#bonferroni 
1+8+48
## [1] 57
0.05/57
## [1] 0.000877193
#Type I Error calc
1 - 0.95^57
## [1] 0.9462665

It is likely that the observations are independent, but it is not known whether this is a random sample. Multivariate normality, homogeneity of within-group covariance matrices, linear relationships among the dependent variables, and the absence of multicollinearity may not have been met. I do not expect there to be extreme univariate or multivariate outliers.
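
As an informal check on that expectation, Mahalanobis distances of the numeric response variables could be compared against a chi-squared cutoff (a minimal sketch, not part of the original analysis; the 0.999 quantile is an arbitrary choice):

#informal multivariate outlier check on the MANOVA response variables
nums <- visits[, c("ofp","opp","emr","hosp","numchron","age","school","faminc")]
md <- mahalanobis(nums, colMeans(nums), cov(nums)) #squared Mahalanobis distances
cutoff <- qchisq(0.999, df = ncol(nums)) #conservative chi-squared cutoff (arbitrary)
sum(md > cutoff) #number of flagged observations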

Assuming the data pass the MANOVA assumptions, the null hypothesis is that the numeric variables have equal means across the levels of the categorical variable region. I left out the number of nonphysician office visits and the number of nonphysician outpatient visits because they would be redundant with the number of physician office visits and the number of physician outpatient visits already included.

The MANOVA found significantly different means among the numeric variables across regions (p-value << 0.05). A Bonferroni adjustment was made to control the Type I error rate, correcting alpha to 0.000877193; at this adjusted level, the follow-up univariate ANOVAs found significant mean differences across regions for the school and family income variables (p-values < 0.00088).

Post hoc analysis was done with pairwise comparisons of the significant variables to find which regions differed in years of schooling and in family income. Only the midwest and northeast regions did not significantly differ in mean years of schooling. For family income, the midwest and west, the other and northeast, and the west and other regions differed significantly. These comparisons were evaluated against the Bonferroni-adjusted alpha for multiple comparisons.
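
An equivalent way to apply the correction is inside the pairwise tests themselves; a minimal sketch for the schooling variable (note that p.adj="bonferroni" here only adjusts for the six pairwise comparisons, so the manual comparison against 0.00088 used above is the more conservative approach):

#pairwise t-tests with Bonferroni-adjusted p-values for years of schooling by region
pairwise.t.test(visits$school, visits$region, p.adj="bonferroni")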

A total of 57 tests were done for this analysis (1 MANOVA, 8 ANOVAs, and 48 t-tests). Without correction there would be a 0.9463 probability of at least one Type I error. The Bonferroni adjustment keeps the familywise error rate at 0.05 by lowering the per-test alpha to 0.00088, as discussed.

##Randomization Test

#distribution of family income by private insurance status
visits %>% ggplot(aes(faminc,fill=privins))+geom_histogram(bins=6.5)+facet_wrap(~privins) + xlab("Family Income ($10,000)") + ggtitle("Private Insurance for Family Incomes")

#mean diff
visits %>% group_by(privins) %>% summarize(means=mean(faminc)) %>% summarize(diff(means))
## # A tibble: 1 x 1
##   `diff(means)`
##           <dbl>
## 1          1.11
#randomization
set.seed(348)

rand_dist<-vector()
for(i in 1:5000){
rand<-data.frame(income=sample(visits$faminc),insurance=visits$privins)
rand_dist[i]<-mean(rand[rand$insurance=="yes",]$income)-
 mean(rand[rand$insurance=="no",]$income)}

#p-value
mean(rand_dist>1.112961)*2
## [1] 0
#t-test
t.test(data=visits,faminc~privins)
## 
##  Welch Two Sample t-test
## 
## data:  faminc by privins
## t = -13.842, df = 2659.1, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -1.270625 -0.955296
## sample estimates:
##  mean in group no mean in group yes 
##          1.662984          2.775944
{hist(rand_dist,main="Test Statistic Distribution",ylab="Frequency",xlim = c(-.4,1.2)); abline(v = 1.112961,col="red")}

The null hypothesis for this randomization test is that mean family income is the same for individuals with and without private insurance. The alternative hypothesis is that mean family income differs between individuals with and without private insurance. None of the 5,000 permuted mean differences were as large as the observed difference of 1.11, so the randomization p-value is effectively 0 and we reject the null hypothesis; the Welch t-test agrees (p < 2.2e-16).
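
The p-value above counts only permuted differences larger than the observed difference and then doubles that proportion; an alternative two-sided version counts draws that are at least as extreme in absolute value (a minimal sketch using the rand_dist vector built above):

#two-sided randomization p-value based on absolute differences
mean(abs(rand_dist) > 1.112961)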

##Linear Regression Model

#fit with interaction
visits$ofp_c<-visits$ofp-mean(visits$ofp)
visits$faminc_c<-visits$faminc-mean(visits$faminc)
fit <- lm(numchron ~ ofp_c*maried*faminc_c, data=visits)
summary(fit)
## 
## Call:
## lm(formula = numchron ~ ofp_c * maried * faminc_c, data = visits)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.2727 -1.0343 -0.3119  0.6588  6.7424 
## 
## Coefficients:
##                           Estimate Std. Error t value Pr(>|t|)    
## (Intercept)               1.589926   0.031457  50.543   <2e-16 ***
## ofp_c                     0.046633   0.004478  10.414   <2e-16 ***
## mariedyes                -0.066920   0.041479  -1.613   0.1067    
## faminc_c                  0.005960   0.014866   0.401   0.6885    
## ofp_c:mariedyes           0.006964   0.006037   1.154   0.2487    
## ofp_c:faminc_c           -0.004273   0.001902  -2.246   0.0247 *  
## mariedyes:faminc_c       -0.031087   0.016807  -1.850   0.0644 .  
## ofp_c:mariedyes:faminc_c  0.005468   0.002346   2.331   0.0198 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.301 on 4398 degrees of freedom
## Multiple R-squared:  0.07292,    Adjusted R-squared:  0.07144 
## F-statistic: 49.42 on 7 and 4398 DF,  p-value: < 2.2e-16
#Plot
ggplot(visits, aes(x=ofp_c, y=numchron,group=maried))+geom_point(aes(color=maried))+
 geom_smooth(method="lm",formula=y~1,se=F,fullrange=T,aes(color=maried))+xlab("Mean-Centered Physician Office Visits") + ylab("Number of Chronic Illnesses") +ggtitle("Number of Chronic Illnesses by Physician Office Visits and Marital Status")

#Assumptions
resids<-fit$residuals; fitvals<-fit$fitted.values
ggplot()+geom_point(aes(fitvals,resids))+geom_hline(yintercept=0, col="red")

bptest(fit)
## 
##  studentized Breusch-Pagan test
## 
## data:  fit
## BP = 120.01, df = 7, p-value < 2.2e-16
ggplot()+geom_histogram(aes(resids),bins=20)

ggplot()+geom_qq(aes(sample=resids))+geom_qq_line()

ks.test(resids, "pnorm", sd=sd(resids))
## Warning in ks.test(resids, "pnorm", sd = sd(resids)): ties should not be
## present for the Kolmogorov-Smirnov test
## 
##  One-sample Kolmogorov-Smirnov test
## 
## data:  resids
## D = 0.13155, p-value < 2.2e-16
## alternative hypothesis: two-sided
#robust standard errors
summary(fit)$coef[,1:2]
##                              Estimate  Std. Error
## (Intercept)               1.589925786 0.031456758
## ofp_c                     0.046633493 0.004477811
## mariedyes                -0.066920123 0.041479153
## faminc_c                  0.005960421 0.014865905
## ofp_c:mariedyes           0.006963694 0.006036566
## ofp_c:faminc_c           -0.004272970 0.001902349
## mariedyes:faminc_c       -0.031087335 0.016806799
## ofp_c:mariedyes:faminc_c  0.005467513 0.002345572
coeftest(fit, vcov = vcovHC(fit))[,1:2]
##                              Estimate  Std. Error
## (Intercept)               1.589925786 0.033875220
## ofp_c                     0.046633493 0.006720185
## mariedyes                -0.066920123 0.043389741
## faminc_c                  0.005960421 0.017991131
## ofp_c:mariedyes           0.006963694 0.009382407
## ofp_c:faminc_c           -0.004272970 0.002789926
## mariedyes:faminc_c       -0.031087335 0.020341280
## ofp_c:mariedyes:faminc_c  0.005467513 0.003488509
#fit w/o interaction
fit2 <- lm(numchron ~ ofp_c+maried+faminc_c, data=visits)
summary(fit2)
## 
## Call:
## lm(formula = numchron ~ ofp_c + maried + faminc_c, data = visits)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.3095 -1.0708 -0.3061  0.6535  6.8883 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.569889   0.029633  52.977  < 2e-16 ***
## ofp_c        0.052171   0.002901  17.982  < 2e-16 ***
## mariedyes   -0.051094   0.040695  -1.256  0.20935    
## faminc_c    -0.019085   0.006927  -2.755  0.00589 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.301 on 4402 degrees of freedom
## Multiple R-squared:  0.07104,    Adjusted R-squared:  0.07041 
## F-statistic: 112.2 on 3 and 4402 DF,  p-value: < 2.2e-16

This linear regression used the number of chronic illnesses as the response variable, with mean-centered physician office visits, marital status, and mean-centered family income as the predictors, including their interactions. The intercept, 1.59, is the predicted number of chronic illnesses for an unmarried individual with average physician office visits and average family income (the centered predictors equal 0 at their means). Holding marital status at unmarried and family income at its mean, each additional physician office visit is associated with 0.0466 more chronic illnesses. Holding office visits and family income at their means, a married individual is predicted to have 0.0669 fewer chronic illnesses than an unmarried individual. For unmarried individuals with average office visits, each $10,000 increase in family income is associated with 0.00596 more chronic illnesses. The interaction coefficients adjust these slopes: the office-visit slope is 0.006964 higher for married than for unmarried individuals; the office-visit slope decreases by 0.004273 for each $10,000 increase in family income; the family-income slope is 0.0311 lower for married than for unmarried individuals; and the office-visit-by-income interaction is 0.005468 larger for married individuals than for unmarried ones.
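
To make the interaction more concrete, predictions can be compared for married and unmarried individuals at low versus high centered office-visit counts while holding centered family income at its mean; the values -3 and 3 below are arbitrary illustration points, not part of the original analysis:

#hypothetical prediction grid to illustrate the office visits by marital status interaction
grid <- expand.grid(ofp_c = c(-3, 3), maried = c("no", "yes"), faminc_c = 0)
grid$pred <- predict(fit, newdata = grid)
grid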

The data do not meet the assumptions of linearity, homoskedasticity, or normality (see the residual plot, Breusch-Pagan test, and Kolmogorov-Smirnov test above).

Using robust standard errors, the coefficient estimates did not change, but the standard errors all increased.
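
To see whether any conclusions change with the robust standard errors, the full test output (including t-values and p-values) could be printed instead of only the first two columns; a minimal sketch using the same lmtest and sandwich calls as above:

#full coefficient tests with heteroskedasticity-robust standard errors
coeftest(fit, vcov = vcovHC(fit))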

This model explains 7.292% of the variation in the outcome.

When the model is run without interactions, family income becomes a significant predictor (p = 0.0059), whereas it was not significant in the interaction model.
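
A formal way to ask whether the interaction terms as a group add explanatory power is to compare the two nested models directly; a minimal sketch using an F-test of fit2 against fit:

#F-test comparing the main-effects model to the full interaction model
anova(fit2, fit)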

##Bootstrapped Standard Errors of Linear Regression Model

samp_distn<-replicate(5000, {
 boot_dat<-visits[sample(nrow(visits),replace=TRUE),] #resample rows with replacement
 fit3<-lm(numchron ~ faminc_c * maried*ofp_c, data=boot_dat) #refit the interaction model
 coef(fit3)
})

samp_distn%>%t%>%as.data.frame%>%summarize_all(sd)
##   (Intercept)   faminc_c mariedyes      ofp_c faminc_c:mariedyes
## 1  0.03338738 0.01741327 0.0427156 0.00653458          0.0195547
##   faminc_c:ofp_c mariedyes:ofp_c faminc_c:mariedyes:ofp_c
## 1    0.002683802     0.009085953              0.003286179
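
The same resampling distribution could also be used for percentile confidence intervals on each coefficient (a minimal sketch; samp_distn has one row per coefficient and one column per bootstrap replicate):

#95% percentile bootstrap confidence intervals for each coefficient
apply(samp_distn, 1, quantile, probs = c(.025, .975))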

The bootstrapped standard errors are much closer to the robust standard errors than to the original standard errors, although they are slightly smaller.

##Logistic Regression

#regression
fit4<-glm(adldiff~ofp_c+faminc_c,data=visits,family=binomial(link="logit"))
coeftest(fit4)
## 
## z test of coefficients:
## 
##               Estimate Std. Error  z value  Pr(>|z|)    
## (Intercept) -1.4128048  0.0392614 -35.9846 < 2.2e-16 ***
## ofp_c        0.0352047  0.0050362   6.9904 2.741e-12 ***
## faminc_c    -0.1358598  0.0204638  -6.6390 3.158e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
exp(coef(fit4))
## (Intercept)       ofp_c    faminc_c 
##   0.2434595   1.0358317   0.8729650
#confusion matrix
visits$prob<-predict(fit4,type="response") 
table(predict=as.numeric(visits$prob>.5),truth=visits$adldiff)%>%addmargins
##        truth
## predict    0    1  Sum
##     0   3492  890 4382
##     1     15    9   24
##     Sum 3507  899 4406
#Accuracy
(3492+9)/4406
## [1] 0.7945983
#Sensitivity (TPR)
9/899
## [1] 0.01001112
#Specificity (TNR)
3492/3507
## [1] 0.9957228
#Recall (PPV)
9/24
## [1] 0.375
#plot
visits$logit<-predict(fit4) #get predicted log-odds
visits$outcome<-factor(visits$adldiff,levels=c("0","1"))
ggplot(visits,aes(logit, fill=outcome))+geom_density(alpha=.3)+
  geom_vline(xintercept=0,lty=2)

#ROC and AUC
ROCplot<-ggplot(visits)+geom_roc(aes(d=adldiff,m=prob), n.cuts=0)+
 geom_segment(aes(x=0,xend=1,y=0,yend=1),lty=2)
ROCplot

calc_auc(ROCplot)
##   PANEL group       AUC
## 1     1    -1 0.6262655
#function to compute accuracy, sensitivity, specificity, PPV, and AUC
class_diag<-function(probs,truth){
 tab<-table(factor(probs>.5,levels=c("FALSE","TRUE")),truth)
 acc=sum(diag(tab))/sum(tab)
 sens=tab[2,2]/colSums(tab)[2]
 spec=tab[1,1]/colSums(tab)[1]
 ppv=tab[2,2]/rowSums(tab)[2]
 if(is.numeric(truth)==FALSE & is.logical(truth)==FALSE) truth<-as.numeric(truth)-1
 #CALCULATE EXACT AUC
 ord<-order(probs, decreasing=TRUE)
 probs <- probs[ord]; truth <- truth[ord]
 TPR=cumsum(truth)/max(1,sum(truth))
 FPR=cumsum(!truth)/max(1,sum(!truth))
 dup<-c(probs[-1]>=probs[-length(probs)], FALSE)
 TPR<-c(0,TPR[!dup],1); FPR<-c(0,FPR[!dup],1)
 n <- length(TPR)
 auc<- sum( ((TPR[-1]+TPR[-n])/2) * (FPR[-1]-FPR[-n]) )
 data.frame(acc,sens,spec,ppv,auc)
} 

#CV 10 fold
set.seed(1234)
k=10 
visitscv<-visits[sample(nrow(visits)),] 
folds<-cut(seq(1:nrow(visits)),breaks=k,labels=F) 
diags<-NULL
for(i in 1:k){

 train<-visitscv[folds!=i,]
 test<-visitscv[folds==i,]
 truth<-test$adldiff

 fit5<-glm(adldiff~ofp+faminc,data=train,family="binomial")
 probs<-predict(fit5,newdata = test,type="response")

 diags<-rbind(diags,class_diag(probs,truth))
}
apply(diags,2,mean) 
##         acc        sens        spec         ppv         auc 
## 0.794836116 0.008991227 0.996312405         NaN 0.625437875

This logistic regression used the binary variable "adldiff" as the response and mean-centered physician office visits and mean-centered family income as predictors. For an individual with average office visits and average family income, the odds of having a condition limiting daily activity are 0.2435. Holding family income constant, each additional physician office visit multiplies those odds by 1.0358 (about a 3.6% increase). Holding office visits constant, each $10,000 increase in family income multiplies the odds by 0.873 (about a 12.7% decrease). Both predictors are significant (p < 0.05).
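
Confidence intervals on the odds-ratio scale would make these effects easier to report; a minimal sketch (confint on a glm uses profile likelihood, so the intervals are not symmetric):

#95% profile-likelihood confidence intervals, exponentiated to the odds-ratio scale
exp(confint(fit4))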

This model is about 79.46% accurate; however, it is important to note that it has an extremely low true positive rate (sensitivity = 0.010) and a near-perfect true negative rate (specificity = 0.996).
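
The low sensitivity is partly a consequence of the 0.5 cutoff: lowering the threshold trades specificity for sensitivity. A minimal sketch with an arbitrary cutoff of 0.2:

#confusion matrix at a lower (hypothetical) classification cutoff
table(predict=as.numeric(visits$prob>.2),truth=visits$adldiff)%>%addmargins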

The AUC of 0.626 is poor, indicating that family income and physician office visits do not do a good job of predicting whether an individual has a condition that limits daily activity.

##Lasso Regression

visitslasso <- visits%>%dplyr::select(-prob,-outcome,-X,-logit) #drop the columns added earlier
y<-as.matrix(visitslasso$adldiff) #binary response
fit6 <- glm(adldiff~.-1, data = visitslasso, family = "binomial")
x<-model.matrix(fit6) #predictor matrix from the full model
x<-scale(x) #standardize predictors before LASSO
cv<-cv.glmnet(x,y) #cross-validation to choose lambda
lasso1<-glmnet(x,y,lambda=cv$lambda.1se) #refit at the 1-SE lambda
coef(lasso1)
## 25 x 1 sparse Matrix of class "dgCMatrix"
##                         s0
## (Intercept)    0.204039946
## ofp            .          
## ofnp           .          
## opp            .          
## opnp           .          
## emr            0.009571913
## hosp           0.001373959
## numchron       0.035682668
## age            0.082327961
## blackno        .          
## blackyes       .          
## sexmale       -0.003289198
## mariedyes     -0.009840636
## school        -0.002073423
## faminc         .          
## employedyes    .          
## privinsyes     .          
## medicaidyes    0.034984107
## regionnoreast  .          
## regionother    .          
## regionwest     .          
## hlthother      .          
## hlthpoor       0.081731447
## ofp_c          .          
## faminc_c       .
#CV 
set.seed(1234)
k=10 
visitscv2<-visitslasso[sample(nrow(visitslasso)),] 
folds<-cut(seq(1:nrow(visitslasso)),breaks=k,labels=F) 
diags2<-NULL
for(i in 1:k){
 train2<-visitscv2[folds!=i,]
 test2<-visitscv2[folds==i,]
 truth2<-test2$adldiff
 fit7<-glm(adldiff~emr+hosp+numchron+age+sex+maried+school+medicaid+hlth,data=train2,family="binomial")
 probs2<-predict(fit7,newdata = test2,type="response")
 diags2<-rbind(diags2,class_diag(probs2,truth2))
}
diags2%>%summarize_all(mean)
##         acc      sens      spec       ppv       auc
## 1 0.8257045 0.3343958 0.9516561 0.6392141 0.8187942

The LASSO regression retains emergency room visits, hospitalizations, number of chronic illnesses, age, sex, marital status, years of schooling, Medicaid coverage, and self-reported health status as the most important predictors of whether an individual has a condition that limits daily living. Running a 10-fold CV with these predictors gives a much higher AUC (0.819) than the model that only used family income and physician office visits (0.625).
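
The retained variables can be pulled programmatically from the fitted LASSO object, which should reproduce the list of predictors used in fit7 above (a minimal sketch):

#names of predictors with nonzero LASSO coefficients, excluding the intercept
lasso_coefs <- as.matrix(coef(lasso1))
retained <- rownames(lasso_coefs)[lasso_coefs[,1] != 0]
setdiff(retained, "(Intercept)")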


