Install R and then RStudio

First Step, R

Second Step, RStudio

Six Data Sets to Download

  1. Plain Text Table 1
  2. SPSS Bank
  3. Stata nhanes
  4. MS-Excel Health
  5. MS-Excel Bands
  6. SAS HSB2

If you would like to follow the low birth weight logistic regression example from 2016, here are the data:

SPSS Low Birth Weight

Seven Packages to Install

  1. openxlsx
  2. foreign
  3. dplyr
  4. tidyr
  5. sas7bdat
  6. descr
  7. hflights
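
All seven packages are on CRAN, so a single install.packages() call (needed only once per machine) sets them up; afterwards each is loaded with library(), as the examples below do:

install.packages(c("openxlsx", "foreign", "dplyr", "tidyr",
                   "sas7bdat", "descr", "hflights"))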

Getting Data Into and Out Of R

Import

Plain Text

?file.show
file.show("c:/chuck/NYU/Stat Group/table1.csv")
?read.table
read.table(file.choose(), sep = ",", header = TRUE)
table1 <- read.table("c:/chuck/NYU/Stat Group/table1.csv", sep = ",", header = TRUE)

table1
  id female inc80 inc81 inc82
1  1      0  5000  5500  6000
2  2      1  2000  2200  3300
3  3      0  3000  2000  1000
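
Since read.csv() is read.table() with sep = "," and header = TRUE preset, an equivalent import is:

table1 <- read.csv("c:/chuck/NYU/Stat Group/table1.csv")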

MS-Excel

library(openxlsx)
?read.xlsx
args(read.xlsx)
function (xlsxFile, sheet, startRow = 1, colNames = TRUE, rowNames = FALSE, 
    detectDates = FALSE, skipEmptyRows = TRUE, skipEmptyCols = TRUE, 
    rows = NULL, cols = NULL, check.names = FALSE, sep.names = ".", 
    namedRegion = NULL, na.strings = "NA", fillMergedCells = FALSE) 
NULL
hsiteA <- read.xlsx("c:/chuck/NYU/Stat Group/Hands-On-R-Practice/Health.xlsx", sheet = 1)
hsiteB <- read.xlsx("c:/chuck/NYU/Stat Group/Hands-On-R-Practice/Health.xlsx", sheet = "Site B", detectDates = TRUE)

hsiteA
   Site Patient Score  Date
1     A       1    47 43005
2     A       2    54 43006
3     A       3    68 43007
4     A       4    55 43008
5     A       5    36 43009
6     A       6    52 43009
7     A       7    28 43009
8     A       8    49 43009
9     A       9    39 43009
10    A      10    37 43009
hsiteB
   Site Patient Score       Date
1     B      11    57 2017-09-27
2     B      12    59 2017-09-28
3     B      13    54 2017-09-29
4     B      14    59 2017-09-30
5     B      15    64 2017-10-01
6     B      16    56 2017-10-01
7     B      17    62 2017-10-01
8     B      18    56 2017-10-01
9     B      19    56 2017-10-01
10    B      20    52 2017-10-01
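
Because detectDates defaults to FALSE, hsiteA's Date column arrived as raw Excel serial numbers (43005, 43006, ...). One way to recover proper dates after import is openxlsx's convertToDate():

hsiteA$Date <- convertToDate(hsiteA$Date)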

SPSS

library(foreign)
?read.spss
bank <- read.spss("c:/chuck/NYU/Stat Group/bank.sav", to.data.frame = TRUE)

head(bank)
   ID SALBEG   SEX TIME   AGE SALNOW EDLEVEL  WORK          JOBCAT MINORITY
1 628   8400 Males   81 28.50  16080      16  0.25 College trainee    White
2 630  24000 Males   73 40.33  41400      16 12.50 Exempt employee    White
3 632  10200 Males   83 31.08  21960      15  4.08 Exempt employee    White
4 633   8700 Males   93 31.17  19200      16  1.83 College trainee    White
5 635  17400 Males   83 41.92  28350      19 13.00 Exempt employee    White
6 637  12996 Males   80 29.50  27250      18  2.42 College trainee    White
      SEXRACE EGENDER EMINORIT EGENXMIN BSALCENT   CSALADJ SALDELTA     RES_1
1 White males      -1       -1        1  1593.57 13133.489     7680 -1013.504
2 White males      -1       -1        1 17193.57  9609.089    17400 -4543.634
3 White males      -1       -1        1  3393.57 15685.289    11760  1537.635
4 White males      -1       -1        1  1893.57 15698.789    10500  1551.686
5 White males      -1       -1        1 10593.57  8762.489    10950 -5387.809
6 White males      -1       -1        1  6189.57 15805.485    14254  1656.804
  GENDER
1      M
2      M
3      M
4      M
5      M
6      M
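
SEX, JOBCAT, and MINORITY arrive as factors because read.spss() applies the SPSS value labels by default; set use.value.labels = FALSE to keep the numeric codes instead (bank_codes is just an illustrative name):

bank_codes <- read.spss("c:/chuck/NYU/Stat Group/bank.sav", to.data.frame = TRUE,
                        use.value.labels = FALSE)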

Stata

?read.dta
nhanes_dta <- read.dta("c:/chuck/NYU/Stat Group/nhanes.dta")

head(nhanes_dta)
  age  bmi hyp chl
1   1   NA  NA  NA
2   2 22.7   1 187
3   1   NA   1 187
4   3   NA  NA  NA
5   1 20.4   1 113
6   3   NA  NA 184

SAS

library(sas7bdat)
?read.sas7bdat
hsb2_sas <- read.sas7bdat("c:/chuck/NYU/Stat Group/Hands-On-R-Practice/hsb2.sas7bdat")

head(hsb2_sas)
  id female race ses schtyp prog read write math science socst
1  3      0    1   1      1    2   63    65   48      63    56
2  5      0    1   1      1    2   47    40   43      45    31
3 16      0    1   1      1    3   47    31   44      36    36
4 35      1    1   1      2    1   60    54   50      50    51
5  8      1    1   1      1    2   39    44   52      44    48
6 19      1    1   1      1    1   28    46   43      44    51

Export

Plain Text

head(iris)
  Sepal.Length Sepal.Width Petal.Length Petal.Width Species
1          5.1         3.5          1.4         0.2  setosa
2          4.9         3.0          1.4         0.2  setosa
3          4.7         3.2          1.3         0.2  setosa
4          4.6         3.1          1.5         0.2  setosa
5          5.0         3.6          1.4         0.2  setosa
6          5.4         3.9          1.7         0.4  setosa
?write.table
write.table(iris, file = "c:/chuck/NYU/Stat Group/iris.csv", sep = ",", col.names = TRUE, row.names = FALSE)
file.show("c:/chuck/NYU/Stat Group/iris.csv")
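
write.csv() is the analogous shorthand, presetting the comma separator and column names:

write.csv(iris, file = "c:/chuck/NYU/Stat Group/iris.csv", row.names = FALSE)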

MS-Excel

?write.xlsx
write.xlsx(iris, file = "c:/chuck/NYU/Stat Group/iris.xlsx")
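
write.xlsx() also accepts a named list of data frames, writing one worksheet per element; for example, the two health-site tables imported earlier could go back into a single workbook (the file name here is just illustrative):

write.xlsx(list("Site A" = hsiteA, "Site B" = hsiteB),
           file = "c:/chuck/NYU/Stat Group/Health-export.xlsx")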

SPSS

?write.foreign
write.foreign(iris, datafile = "c:/chuck/NYU/Stat Group/iris.dat", 
                    codefile = "c:/chuck/NYU/Stat Group/iris-read.sps", 
                    package = "SPSS")
file.show("c:/chuck/NYU/Stat Group/iris-read.sps")

Stata

?write.dta
write.dta(iris, file = "c:/chuck/NYU/Stat Group/iris.dta", version=10)

Data Management

library(dplyr)
library(tidyr)
library(hflights)
head(hflights)
summary(hflights)
hflights <- tbl_df(hflights)
glimpse(hflights)

Create New Variables

g1 <- mutate(hflights, ActualGroundTime = ActualElapsedTime - AirTime,
                       AverageSpeed = Distance / AirTime * 60)

summary(g1)
      Year          Month          DayofMonth      DayOfWeek        DepTime    
 Min.   :2011   Min.   : 1.000   Min.   : 1.00   Min.   :1.000   Min.   :   1  
 1st Qu.:2011   1st Qu.: 4.000   1st Qu.: 8.00   1st Qu.:2.000   1st Qu.:1021  
 Median :2011   Median : 7.000   Median :16.00   Median :4.000   Median :1416  
 Mean   :2011   Mean   : 6.514   Mean   :15.74   Mean   :3.948   Mean   :1396  
 3rd Qu.:2011   3rd Qu.: 9.000   3rd Qu.:23.00   3rd Qu.:6.000   3rd Qu.:1801  
 Max.   :2011   Max.   :12.000   Max.   :31.00   Max.   :7.000   Max.   :2400  
                                                                 NA's   :2905  
    ArrTime     UniqueCarrier        FlightNum      TailNum         
 Min.   :   1   Length:227496      Min.   :   1   Length:227496     
 1st Qu.:1215   Class :character   1st Qu.: 855   Class :character  
 Median :1617   Mode  :character   Median :1696   Mode  :character  
 Mean   :1578                      Mean   :1962                     
 3rd Qu.:1953                      3rd Qu.:2755                     
 Max.   :2400                      Max.   :7290                     
 NA's   :3066                                                       
 ActualElapsedTime    AirTime         ArrDelay          DepDelay      
 Min.   : 34.0     Min.   : 11.0   Min.   :-70.000   Min.   :-33.000  
 1st Qu.: 77.0     1st Qu.: 58.0   1st Qu.: -8.000   1st Qu.: -3.000  
 Median :128.0     Median :107.0   Median :  0.000   Median :  0.000  
 Mean   :129.3     Mean   :108.1   Mean   :  7.094   Mean   :  9.445  
 3rd Qu.:165.0     3rd Qu.:141.0   3rd Qu.: 11.000   3rd Qu.:  9.000  
 Max.   :575.0     Max.   :549.0   Max.   :978.000   Max.   :981.000  
 NA's   :3622      NA's   :3622    NA's   :3622      NA's   :2905     
    Origin              Dest              Distance          TaxiIn       
 Length:227496      Length:227496      Min.   :  79.0   Min.   :  1.000  
 Class :character   Class :character   1st Qu.: 376.0   1st Qu.:  4.000  
 Mode  :character   Mode  :character   Median : 809.0   Median :  5.000  
                                       Mean   : 787.8   Mean   :  6.099  
                                       3rd Qu.:1042.0   3rd Qu.:  7.000  
                                       Max.   :3904.0   Max.   :165.000  
                                                        NA's   :3066     
    TaxiOut         Cancelled       CancellationCode      Diverted       
 Min.   :  1.00   Min.   :0.00000   Length:227496      Min.   :0.000000  
 1st Qu.: 10.00   1st Qu.:0.00000   Class :character   1st Qu.:0.000000  
 Median : 14.00   Median :0.00000   Mode  :character   Median :0.000000  
 Mean   : 15.09   Mean   :0.01307                      Mean   :0.002853  
 3rd Qu.: 18.00   3rd Qu.:0.00000                      3rd Qu.:0.000000  
 Max.   :163.00   Max.   :1.00000                      Max.   :1.000000  
 NA's   :2947                                                            
 ActualGroundTime  AverageSpeed   
 Min.   :  5.00   Min.   : 98.82  
 1st Qu.: 15.00   1st Qu.:389.36  
 Median : 20.00   Median :429.38  
 Mean   : 21.18   Mean   :421.02  
 3rd Qu.: 25.00   3rd Qu.:462.50  
 Max.   :176.00   Max.   :763.64  
 NA's   :3622     NA's   :3622    

Subset

Selecting Variables

select(hflights, ActualElapsedTime, AirTime, ArrDelay, DepDelay)
select(hflights, Origin:Cancelled)
select(hflights, Year:DayOfWeek, ArrDelay:Diverted)
select(hflights, ends_with("Delay"))
select(hflights, UniqueCarrier, ends_with("Num"), starts_with("Cancel"))

Selecting Observations

filter(hflights, Distance >= 3000)
filter(hflights, UniqueCarrier %in% c("B6", "WN", "DL"))  # JetBlue, Southwest, Delta carrier codes
filter(hflights, (TaxiOut + TaxiIn) > AirTime)
filter(hflights, DepTime < 500 | ArrTime > 2200)

Selecting Variables and Observations

hflights %>%
  filter(Distance >= 3000) %>%
  select(UniqueCarrier, ends_with("Delay"))

Arrange

arrange(hflights, UniqueCarrier, DepDelay)
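
arrange() sorts in ascending order by default; wrap a column in desc() to reverse it:

arrange(hflights, UniqueCarrier, desc(DepDelay))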

Aggregate

hflights %>%
  group_by(Month) %>%
  summarize(n = n(),
            Mean_Arrival_Delay = mean(ArrDelay, na.rm = TRUE),
            SD_Arrival_Delay = sd(ArrDelay, na.rm = TRUE))

Merge

members <- read.xlsx("c:/chuck/NYU/Stat Group/Hands-On-R-Practice/Bands.xlsx", 
                     sheet = "Members")
str(members)
'data.frame':   3 obs. of  2 variables:
 $ name: chr  "Mick" "John" "Paul"
 $ band: chr  "Stones" "Beatles" "Beatles"
instruments <- read.xlsx("c:/chuck/NYU/Stat Group/Hands-On-R-Practice/Bands.xlsx", 
                         sheet = "Instruments")
str(instruments)
'data.frame':   3 obs. of  2 variables:
 $ name : chr  "John" "Paul" "Keith"
 $ plays: chr  "Guitar" "Bass" "Guitar"
  • How do we join members and instruments together into one data frame?
  • The variable name is a unique identifier, or "key", present in both members and instruments

left_join

left_join(members, instruments, by = "name")
  name    band  plays
1 Mick  Stones   <NA>
2 John Beatles Guitar
3 Paul Beatles   Bass

right_join

right_join(members, instruments, by = "name")
   name    band  plays
1  John Beatles Guitar
2  Paul Beatles   Bass
3 Keith    <NA> Guitar

inner_join

inner_join(members, instruments, by = "name")
  name    band  plays
1 John Beatles Guitar
2 Paul Beatles   Bass

full_join

full_join(members, instruments, by = "name")
   name    band  plays
1  Mick  Stones   <NA>
2  John Beatles Guitar
3  Paul Beatles   Bass
4 Keith    <NA> Guitar

Rename Variables

head(mtcars)
                   mpg cyl disp  hp drat    wt  qsec vs am gear carb
Mazda RX4         21.0   6  160 110 3.90 2.620 16.46  0  1    4    4
Mazda RX4 Wag     21.0   6  160 110 3.90 2.875 17.02  0  1    4    4
Datsun 710        22.8   4  108  93 3.85 2.320 18.61  1  1    4    1
Hornet 4 Drive    21.4   6  258 110 3.08 3.215 19.44  1  0    3    1
Hornet Sportabout 18.7   8  360 175 3.15 3.440 17.02  0  0    3    2
Valiant           18.1   6  225 105 2.76 3.460 20.22  1  0    3    1
mtcars_new1 <- rename(mtcars, Cylinders = cyl)
head(mtcars_new1)
                   mpg Cylinders disp  hp drat    wt  qsec vs am gear carb
Mazda RX4         21.0         6  160 110 3.90 2.620 16.46  0  1    4    4
Mazda RX4 Wag     21.0         6  160 110 3.90 2.875 17.02  0  1    4    4
Datsun 710        22.8         4  108  93 3.85 2.320 18.61  1  1    4    1
Hornet 4 Drive    21.4         6  258 110 3.08 3.215 19.44  1  0    3    1
Hornet Sportabout 18.7         8  360 175 3.15 3.440 17.02  0  0    3    2
Valiant           18.1         6  225 105 2.76 3.460 20.22  1  0    3    1
mtcars_new1 %>% 
  select(mpg:wt) %>% 
  rename(`Miles per Gallon` = mpg) %>% 
  head()
                  Miles per Gallon Cylinders disp  hp drat    wt
Mazda RX4                     21.0         6  160 110 3.90 2.620
Mazda RX4 Wag                 21.0         6  160 110 3.90 2.875
Datsun 710                    22.8         4  108  93 3.85 2.320
Hornet 4 Drive                21.4         6  258 110 3.08 3.215
Hornet Sportabout             18.7         8  360 175 3.15 3.440
Valiant                       18.1         6  225 105 2.76 3.460

Reshape

Wide to Long

?gather
gather(iris, key = flower_att, value = measurement,
       Sepal.Length, Sepal.Width, Petal.Length, Petal.Width)
gather(table1, Year, Income, -id, -female)
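
With table1 as imported above, the second call stacks inc80-inc82 into a Year key and an Income value, one row per person-year:

  id female  Year Income
1  1      0 inc80   5000
2  2      1 inc80   2000
3  3      0 inc80   3000
4  1      0 inc81   5500
5  2      1 inc81   2200
6  3      0 inc81   2000
7  1      0 inc82   6000
8  2      1 inc82   3300
9  3      0 inc82   1000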

Long to Wide

?spread
long_DF <- expand.grid(Subject = 1:3, Time = c("BL","FU1","FU2"))
long_DF$Y <- sample(1:5, 9, replace = TRUE)  # no set.seed() here, so your draws will differ from those shown
long_DF
  Subject Time Y
1       1   BL 3
2       2   BL 4
3       3   BL 5
4       1  FU1 1
5       2  FU1 2
6       3  FU1 2
7       1  FU2 1
8       2  FU2 5
9       3  FU2 2
spread(long_DF, key = Time, value = Y)
  Subject BL FU1 FU2
1       1  3   1   1
2       2  4   2   5
3       3  5   2   2
long_DF <- expand.grid(Subject = 1:3, Time = c("BL","FU1","FU2"), Item = c("Y1","Y2"))
long_DF$Response <- sample(1:5, 18, replace = TRUE)
long_DF
   Subject Time Item Response
1        1   BL   Y1        5
2        2   BL   Y1        3
3        3   BL   Y1        2
4        1  FU1   Y1        4
5        2  FU1   Y1        2
6        3  FU1   Y1        3
7        1  FU2   Y1        5
8        2  FU2   Y1        3
9        3  FU2   Y1        1
10       1   BL   Y2        1
11       2   BL   Y2        1
12       3   BL   Y2        2
13       1  FU1   Y2        4
14       2  FU1   Y2        4
15       3  FU1   Y2        2
16       1  FU2   Y2        5
17       2  FU2   Y2        3
18       3  FU2   Y2        4
unite(long_DF, Item_Time, Item, Time) %>%
  spread(key = Item_Time, value = Response)
  Subject Y1_BL Y1_FU1 Y1_FU2 Y2_BL Y2_FU1 Y2_FU2
1       1     5      4      5     1      4      5
2       2     3      2      3     1      4      3
3       3     2      3      1     2      2      4
long_DF <- expand.grid(Subject = 1:3, Time = c("BL","FU1","FU2"))
long_DF$Y1 <- sample(1:5, 9, replace = TRUE)
long_DF$Y2 <- sample(1:5, 9, replace = TRUE)
long_DF$Y3 <- sample(1:5, 9, replace = TRUE)
long_DF
  Subject Time Y1 Y2 Y3
1       1   BL  3  4  1
2       2   BL  5  1  4
3       3   BL  5  1  4
4       1  FU1  5  3  3
5       2  FU1  3  1  2
6       3  FU1  4  1  3
7       1  FU2  4  1  2
8       2  FU2  2  4  1
9       3  FU2  5  5  2
long_DF %>%
  gather(Variable, Value, Y1, Y2, Y3) %>%
  unite(Variable_Time, Variable, Time) %>%
  spread(Variable_Time, Value)
  Subject Y1_BL Y1_FU1 Y1_FU2 Y2_BL Y2_FU1 Y2_FU2 Y3_BL Y3_FU1 Y3_FU2
1       1     3      5      4     4      3      1     1      3      2
2       2     5      3      2     1      1      4     4      2      1
3       3     5      4      5     1      1      5     4      3      2

There are other ways to do merging, filtering, variable selection, and reshaping using base R and the data.table package.

Looking for Lost Observations

Generate Data for Example

Raw Data

raw_data <- expand.grid(PATIENT = LETTERS[1:10], TIME = 1:3)  # 10 patients x 3 times = 30 rows

Processed Data with Missing Observations

set.seed(9417)
processed_data <- raw_data[sample(30, 24),]  # randomly drop 6 of the 30 rows

Add One Duplicated Observation to Processed Data

processed_data <- bind_rows(processed_data, processed_data[1,])

Check Whether Raw and Processed Are Equal

setequal(raw_data, processed_data)
[1] FALSE

How Many Unique Patients?

Raw Data

raw_data %>% 
  count(PATIENT) %>%
  nrow()
[1] 10

Processed Data

processed_data %>% 
  count(PATIENT) %>%
  nrow()
[1] 10
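
dplyr's n_distinct() gets the same answers more directly:

n_distinct(raw_data$PATIENT)
[1] 10
n_distinct(processed_data$PATIENT)
[1] 10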

Observations in Raw But Not Processed

anti_join(raw_data, processed_data)
  PATIENT TIME
1       A    1
2       J    1
3       C    3
4       F    3
5       I    3
6       J    3

Observations in Processed But Not Raw

anti_join(processed_data, raw_data)
[1] PATIENT TIME   
<0 rows> (or 0-length row.names)

Check for Duplicates

Raw Data

any(duplicated(raw_data))
[1] FALSE

Processed Data

any(duplicated(processed_data))
[1] TRUE

Which Observations Are Duplicated?

processed_data[duplicated(processed_data),]
   PATIENT TIME
25       D    3

How Many Observations Appear in Both Raw and Processed?

semi_join(raw_data, processed_data) %>%
  nrow()
[1] 24

Analysis

Descriptive Statistics

Categorical

library(descr)

freq(iris$Species, plot=FALSE)
iris$Species 
           Frequency Percent
setosa            50   33.33
versicolor        50   33.33
virginica         50   33.33
Total            150  100.00
with(bank, CrossTable(GENDER, MINORITY))
   Cell Contents 
|-------------------------|
|                       N | 
| Chi-square contribution | 
|           N / Row Total | 
|           N / Col Total | 
|         N / Table Total | 
|-------------------------|

==================================
          MINORITY
GENDER    White   Nonwhite   Total
----------------------------------
M           194         64     258
          0.271      0.965        
          0.752      0.248   0.544
          0.524      0.615        
          0.409      0.135        
----------------------------------
F           176         40     216
          0.324      1.153        
          0.815      0.185   0.456
          0.476      0.385        
          0.371      0.084        
----------------------------------
Total       370        104     474
          0.781      0.219        
==================================

Continuous

summary(iris)
  Sepal.Length    Sepal.Width     Petal.Length    Petal.Width   
 Min.   :4.300   Min.   :2.000   Min.   :1.000   Min.   :0.100  
 1st Qu.:5.100   1st Qu.:2.800   1st Qu.:1.600   1st Qu.:0.300  
 Median :5.800   Median :3.000   Median :4.350   Median :1.300  
 Mean   :5.843   Mean   :3.057   Mean   :3.758   Mean   :1.199  
 3rd Qu.:6.400   3rd Qu.:3.300   3rd Qu.:5.100   3rd Qu.:1.800  
 Max.   :7.900   Max.   :4.400   Max.   :6.900   Max.   :2.500  
       Species  
 setosa    :50  
 versicolor:50  
 virginica :50  
                
                
                
iris %>%
  select(-Species) %>%
  gather(Variable, Value) %>%
  group_by(Variable) %>%
  summarize(n = n(),
            Mean = mean(Value),
            SD = sd(Value),
            Median = median(Value),
            IQR = IQR(Value),
            Min = min(Value),
            Max = max(Value)) %>%
  knitr::kable()
Variable        n      Mean         SD   Median   IQR   Min   Max
Petal.Length  150  3.758000  1.7652982     4.35   3.5   1.0   6.9
Petal.Width   150  1.199333  0.7622377     1.30   1.5   0.1   2.5
Sepal.Length  150  5.843333  0.8280661     5.80   1.3   4.3   7.9
Sepal.Width   150  3.057333  0.4358663     3.00   0.5   2.0   4.4

Continuous By Categories

iris %>%
  gather(Variable, Value, -Species) %>%
  group_by(Species, Variable) %>%
  summarize(n = n(),
            Mean = mean(Value),
            SD = sd(Value),
            Median = median(Value),
            IQR = IQR(Value),
            Min = min(Value),
            Max = max(Value)) %>%
  knitr::kable()
Species     Variable       n   Mean         SD  Median    IQR   Min   Max
setosa      Petal.Length  50  1.462  0.1736640    1.50  0.175   1.0   1.9
setosa      Petal.Width   50  0.246  0.1053856    0.20  0.100   0.1   0.6
setosa      Sepal.Length  50  5.006  0.3524897    5.00  0.400   4.3   5.8
setosa      Sepal.Width   50  3.428  0.3790644    3.40  0.475   2.3   4.4
versicolor  Petal.Length  50  4.260  0.4699110    4.35  0.600   3.0   5.1
versicolor  Petal.Width   50  1.326  0.1977527    1.30  0.300   1.0   1.8
versicolor  Sepal.Length  50  5.936  0.5161711    5.90  0.700   4.9   7.0
versicolor  Sepal.Width   50  2.770  0.3137983    2.80  0.475   2.0   3.4
virginica   Petal.Length  50  5.552  0.5518947    5.55  0.775   4.5   6.9
virginica   Petal.Width   50  2.026  0.2746501    2.00  0.500   1.4   2.5
virginica   Sepal.Length  50  6.588  0.6358796    6.50  0.675   4.9   7.9
virginica   Sepal.Width   50  2.974  0.3224966    3.00  0.375   2.2   3.8

Inferential Statistics

Fisher’s Exact Test

midata <- data.frame(Condition = rep(c('Placebo','Aspirin'), each = 1000),
                     MI = rep(c('No','Yes','No','Yes'), c(980,20,990,10)))
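
A quick check that the counts came out as intended:

with(midata, table(Condition, MI))
         MI
Condition  No Yes
  Aspirin 990  10
  Placebo 980  20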

with(midata, CrossTable(Condition, MI))
with(midata, fisher.test(Condition, MI))

Examples of t-test for SPSS

Single Sample t-test

with(hsb2_sas, t.test(write, mu = 50))

    One Sample t-test

data:  write
t = 4.1403, df = 199, p-value = 5.121e-05
alternative hypothesis: true mean is not equal to 50
95 percent confidence interval:
 51.45332 54.09668
sample estimates:
mean of x 
   52.775 

Paired t-test

with(hsb2_sas, t.test(write, read, paired = TRUE))

    Paired t-test

data:  write and read
t = 0.86731, df = 199, p-value = 0.3868
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
 -0.6941424  1.7841424
sample estimates:
mean of the differences 
                  0.545 

Independent Group t-test

with(hsb2_sas, t.test(write ~ female))

    Welch Two Sample t-test

data:  write by female
t = -3.6564, df = 169.71, p-value = 0.0003409
alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
95 percent confidence interval:
 -7.499159 -2.240734
sample estimates:
mean in group 0 mean in group 1 
       50.12088        54.99083 
with(hsb2_sas, t.test(write ~ female, var.equal = TRUE))

    Two Sample t-test

data:  write by female
t = -3.7341, df = 198, p-value = 0.0002463
alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
95 percent confidence interval:
 -7.441835 -2.298059
sample estimates:
mean in group 0 mean in group 1 
       50.12088        54.99083 

Correlation Coefficients

hsb2_sas %>%
  select(read:socst) %>%
  cor() %>%
  round(2)
        read write math science socst
read    1.00  0.60 0.66    0.63  0.62
write   0.60  1.00 0.62    0.57  0.60
math    0.66  0.62 1.00    0.63  0.54
science 0.63  0.57 0.63    1.00  0.47
socst   0.62  0.60 0.54    0.47  1.00
cor.test(~ math + science, data = hsb2_sas)

    Pearson's product-moment correlation

data:  math and science
t = 11.437, df = 198, p-value < 2.2e-16
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
 0.5391745 0.7075569
sample estimates:
      cor 
0.6307332 

Linear Regression

lm.1 <- lm(science ~ math + read, data = hsb2_sas)
summary(lm.1)

Call:
lm(formula = science ~ math + read, data = hsb2_sas)

Residuals:
     Min       1Q   Median       3Q      Max 
-17.5426  -4.2618  -0.4676   4.8989  22.1388 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept) 11.61550    3.05426   3.803 0.000191 ***
math         0.40172    0.07259   5.534 9.90e-08 ***
read         0.36542    0.06633   5.509 1.12e-07 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 7.188 on 197 degrees of freedom
Multiple R-squared:  0.4782,    Adjusted R-squared:  0.4729 
F-statistic: 90.27 on 2 and 197 DF,  p-value: < 2.2e-16
confint(lm.1)
                2.5 %     97.5 %
(Intercept) 5.5922552 17.6387468
math        0.2585632  0.5448782
read        0.2346128  0.4962283
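
The standard residual diagnostics for the fitted model are one plot() call away:

par(mfrow = c(2, 2))  # arrange the four diagnostic panels in a 2 x 2 grid
plot(lm.1)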

Reports with R Markdown

Title, Author, and Date

Very beginning of the Rmd file:

---
title: "Hands-On R Practice"
author: "Chuck Cleland"
date: "October 26, 2017"
output:
  html_document:
    highlight: tango
    theme: readable
---

Table of Contents

---
title: "Hands-On R Practice"
author: "Chuck Cleland"
date: "October 26, 2017"
output:
  html_document:
    highlight: tango
    theme: readable
    toc: yes
---

Headers

# Header 1
## Header 2
### Header 3
#### Header 4
##### Header 5
###### Header 6

(Rendered, these appear as progressively smaller headings, Header 1 through Header 6.)

Bullet Point and Numbered Lists

Unordered

* unordered list
   + sub-item 1
   + sub-item 2
       - sub-sub-item 1
* item 2

Ordered

1. ordered list
2. item 2
   i) sub-item 1
     A. sub-sub-item 1

R Code

```{r}
rnorm(10)
```
rnorm(10)
 [1] -1.0644655  0.1078503 -0.4054216  1.1376249 -0.3121775 -0.5842064
 [7] -1.3791827 -1.4536491  0.8062141  0.6957398
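
Chunk options control what a chunk contributes to the report; for example, echo = FALSE runs the code and keeps its output while hiding the code itself:

```{r, echo = FALSE}
rnorm(10)
```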

R Output

HTML

MS-Word

PDF

PDF output requires a LaTeX installation

Examples of R Markdown Reports

Sites

Prosper