290 likes | 407 Views
Creating Syndrome Definitions Using RStudio. Tim Hopper. Data Scientist RTI International. Code Is Available Online. https://gist.github.com/tdhopper/d5939aaf74886143224e/raw/3ae883a25ef078a5edd2fcced0f0268b34be3d6b/Custom+Syndromes. Setup. # Connect to TarrantCounty_FP database
E N D
Creating Syndrome Definitions Using RStudio Tim Hopper Data Scientist RTI International
Code Is Available Online https://gist.github.com/tdhopper/d5939aaf74886143224e/raw/3ae883a25ef078a5edd2fcced0f0268b34be3d6b/Custom+Syndromes
Setup # Connect to TarrantCounty_FP database # Credentials USERNAME <- 'username' PASSWORD <- 'password' HOSTNAME <- 'data3.biosen.se' DBNAME <- 'TarrantCounty_FP' TABLE <- 'TC_Meaningful_Use_Base' # Create database connection con <- dbConnect(dbDriver('MySQL'), user=USERNAME, password=PASSWORD, host=HOSTNAME, dbname=DBNAME)
Example: Co-morbid Syndrome • We want to see the co-occurrence of influenza (influenza-like illness) and asthma. • Data source: Texas region 2/3 • Location: Tarrant County • Time: February 1–October 31, 2013
Query for Asthma SELECT Facility_City, Facility_State, Diagnosis_Code, Diagnosis_Text, Chief_Complaint, Age, Gender, Visit_Date_Time, Row_Number FROM TC_Meaningful_Use_Base WHERE Visit_Date_Time BETWEEN '2013-02-01 00:00:00' AND '2013-10-31 23:59:59' AND (Diagnosis_Code LIKE '%493%')
Query for Influenza-Like Illness SELECT Facility_City, Facility_State, Diagnosis_Code, Diagnosis_Text, Chief_Complaint, Age, Gender, Visit_Date_Time, Row_Number FROM TC_Meaningful_Use_Base WHERE Visit_Date_Time BETWEEN '2013-02-01 00:00:00' AND '2013-10-31 23:59:59' AND (Diagnosis_Code LIKE '%487%' OR Diagnosis_Code LIKE '%488%' OR Diagnosis_Code LIKE '%V04.8%' OR Diagnosis_Code LIKE '%V0481%' OR Diagnosis_Code LIKE '%V06.6%' OR Diagnosis_Code LIKE '%V066%')
Run Query and Process Data # Run Query df.asthma <- dbGetQuery(con, query.asthma) df.ili <- dbGetQuery(con, query.ili) # Add column naming each as a syndrome df.asthma$Syndrome <- 'ASTHMA' df.ili$Syndrome <- 'ILI' # Combine these two data sets into one data.frame df <- rbind(df.asthma, df.ili) # Format dates and add date column (without time) df$Visit_Date_Time <- ymd_hms(df$Visit_Date_Time) df$Visit_Date <- as.Date(df$Visit_Date_Time)
Create Summary Data Set events.per.day.split <- ddply(df, .(Visit_Date, Syndrome), summarize, Number_of_Visits=length(Visit_Date)) ############################################ # Visit_Date Syndrome Number_of_Visits # 1 2013-02-01 ASTHMA 49 # 2 2013-02-01 ASTHMA & ILI 2 # 3 2013-02-01 ILI 5 # 4 2013-02-02 ASTHMA 60 # 5 2013-02-02 ILI 21 # 6 2013-02-03 ASTHMA 89 ############################################
Visits Per Day by Syndrome ggplot(events.per.day.split) + aes(Visit_Date, Number_of_Visits, color=Syndrome) + geom_line()
Create Summary Data Set events.per.day <- ddply(df, .(Visit_Date), summarize, Number_of_Visits=length(Visit_Date)) ############################### # Visit_Date Number_of_Visits # 1 2013-02-01 513 # 2 2013-02-02 396 # 3 2013-02-03 428 # 4 2013-02-04 409 # 5 2013-02-05 580 # 6 2013-02-06 391 ###############################
Visits Per Day by Syndrome ggplot(events.per.day) + aes(Visit_Date, Number_of_Visits) + geom_line()
Example: New Syndrome • We want to see create a new syndrome to identify visits during which the patient had cough AND dizziness AND headache. • Data source: Texas region 2/3 • Location: Tarrant County • Time: February 1–October 31, 2013
Query SELECT Facility_City, Facility_State, Diagnosis_Code, Diagnosis_Text, Chief_Complaint, Age, Gender, Visit_Date_Time, Row_Number FROM TC_Meaningful_Use_Base WHERE Visit_Date_Time BETWEEN '2013-02-01 00:00:00' AND '2013-10-31 23:59:59' AND (Diagnosis_Code LIKE '%786.2%' OR Diagnosis_Code LIKE '%7862%') AND (Diagnosis_Code LIKE '%780.4%' OR Diagnosis_Code LIKE '%7804%') AND (Diagnosis_Code LIKE '%784.0%' OR Diagnosis_Code LIKE '%7840%');
Run Query # Run Query df.sick <- dbGetQuery(con, query) # Fix dates using lubridate df.sick$Visit_Date_Time <- ymd_hms(df.sick$Visit_Date_Time) # Create a month column df.sick$Month <- month(df.sick$Visit_Date, label=T)
Run Query ggplot(df.sick) + aes(Month) + geom_histogram()
Create Line Listing write.csv(df.sick, 'sick.csv', quote=F, row.names=F) # sick.csv: # # Row_Number,Facility_City,Facility_State,Diagnosis_Code,D... # 1374852,Houston,TX,473.9:780.4:300.00:786.2:784.0:305.1:... # 1536525,Houston,TX,486:786.2:780.4:784.0:794.00:789.00:7... # 2100347,Rowlett,TX,780.4:784.0:786.2,NA,SCREENING - HA -... # 2189305,Rowlett,TX,780.4:784.0:786.2:V76.12,NA,SCREENING... # 3108090,Rowlett,TX,780.4:784.0:786.2:V76.12,NA,SCREENING... # 5887191,Rowlett,TX,786.2:780.1:780.4:784.0,NA,786.2:SEP:... # 7968958,Houston,TX,493.90:780.4:780.60:784.0:786.2:787.0... # 9197758,Houston,TX,493.90:780.4:780.60:784.0:786.2:787.0...
Example: Refined Age Groups • We want to see motor vehicle traffic accidents involving young people. We recombine the ages to the following groups: 0–15, 16–20, 21–25, 26–30, and 31–35 years. • Data source: Texas region 2/3 • Location: Tarrant County • Time: February 1–October 31, 2013
Query SELECT Facility_City, Facility_State, Diagnosis_Code, Diagnosis_Text, Chief_Complaint, Age, Gender, Visit_Date_Time, Row_Number FROM TC_Meaningful_Use_Base WHERE Visit_Date_Time BETWEEN '2013-02-01 00:00:00' AND '2013-10-31 23:59:59' AND (Diagnosis_Code LIKE '%E81_%') AND Age <= 35;
Run Query # Run Query df.auto <- dbGetQuery(con, query) # Fix dates using lubridate df.auto$Visit_Date_Time <- ymd_hms(df.auto$Visit_Date_Time) # Create a date column df.auto$Visit_Date <- as.Date(df.auto$Visit_Date_Time)
Bin Ages # Drop all rows where age is greater than 35 years or is undefined df.auto <- df.auto[!is.na(df.auto$Age),] df.auto <- df.auto[df.auto$Age <= 35,] # Bin ages df.auto$Age_binned <- cut(df.auto$Age, breaks=c(0, 15, 20, 25, 30, 35), include.lowest=T)
Histogram of Visits by Age Group ggplot(df.auto) + aes(Age_binned) + geom_histogram()
Create Summary Data Set df.auto.daily.counts <- ddply(df.auto, .(Visit_Date, Age_binned), summarize, count=length(Chief_Complaint)) ################################ # Visit_Date Age_binned count # 1 2013-02-01 [0,15] 3 # 2 2013-02-01 (15,20] 25 # 3 2013-02-01 (20,25] 16 # 4 2013-02-01 (25,30] 16 # 5 2013-02-02 [0,15] 13 # 6 2013-02-02 (15,20] 6 ###############################
Visits per Week by Age ggplot(df.auto.daily.counts) + aes(x = Visit_Date, y = count, color=Age_binned) + geom_line(size=2, alpha=.7)
Visits per Week by Age ggplot(df.auto.daily.counts) + aes(x = Visit_Date, y = count, color=Age_binned) + geom_smooth(size=3, alpha=.7)