The purpose of this script is to generate a list of UK Biobank participants which meet QC/filtering criteria:
suppressMessages(silent <- lapply(
c("plyr", "dplyr", "tidyverse", "data.table", "vroom", "knitr"),
library, character.only=T))
table = function (..., useNA = 'always') base::table(..., useNA = useNA)## [1] 502527 5172
This code chunk has been modified for display to hide the 5-digit code that came with our data table
bd=vroom("/Users/mike/Documents/R_files/UKBpheno/pheno/ukbXXXXX.tab", delim="\t", show_col_types = FALSE)
source("src/components/ukbXXXXX_factordata.R") #file provided by UKB "ukbxxxxx_loaddata.R" without the loading part, to label the responses in survey questions
bd=as_tibble(bd)
dim(bd)
withdrawn<-read.csv("src/components/w48818_20220222.csv", header = FALSE)
bd=bd[!(bd$f.eid %in% withdrawn$V1), ]pan<-read_tsv("src/components/PanUKBB/all_pops_non_eur_pruned_within_pop_pc_covs.tsv")## Rows: 448216 Columns: 28
## ── Column specification ─────────────────────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (1): pop
## dbl (26): s, PC1, PC2, PC3, PC4, PC5, PC6, PC7, PC8, PC9, PC10, PC11, PC12, ...
## lgl (1): related
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
pan<-as_tibble(pan)
pan$s<-as.integer(pan$s)
table(pan$pop)%>%kbl()| Var1 | Freq |
|---|---|
| AFR | 6805 |
| AMR | 996 |
| CSA | 9109 |
| EAS | 2783 |
| EUR | 426901 |
| MID | 1622 |
| NA | 0 |
bridge<-read.table("src/components/PanUKBB/ukb48818bridge31063.txt")
bridge<-as_tibble(bridge)
colnames(bridge)<-c("IID", "panID")
pan2<-pan%>%select(s, pop)%>%
left_join(bridge, by=c("s"="panID"))bd_QC<- bd %>% select(f.eid, f.31.0.0, f.22001.0.0, f.21000.0.0, f.22027.0.0, f.22019.0.0, f.22021.0.0)
colnames(bd_QC)<-c("IID", "Sex", "Genetic_Sex", "Race", "Outliers_for_het_or_missing", "SexchrAneuploidy", "Genetic_kinship")bd_QC<-as_tibble(bd_QC) #502,527
nrow(bd_QC) #[1] 502527 ## [1] 502527
bd_QC<-bd_QC%>%inner_join(pan2, by="IID")#Filter by Genetic ethnicity = Caucasian VIA PAN UKBB
bd_QC<-bd_QC[bd_QC$pop=="EUR",] #nrow(bd_QC) #[1] 426881
bd_QC<-bd_QC%>% filter(is.na(Outliers_for_het_or_missing) | Outliers_for_het_or_missing !="Yes") #nrow(bd_QC) #[1] 426433
bd_QC<-bd_QC%>% filter(is.na(SexchrAneuploidy) | SexchrAneuploidy != "Yes") #nrow(bd_QC) #[1] 425854
bd_QC<- bd_QC%>% filter(is.na(Genetic_kinship) | Genetic_kinship != "Ten or more third-degree relatives identified")
#If Sex does not equal genetic sex, exclude participant
bd_QC<-bd_QC[bd_QC$Sex == bd_QC$Genetic_Sex,]
nrow(bd_QC) #[1] 425683## [1] 426373
QCkeepparticipants<-bd_QC%>%select(IID)
write.table(QCkeepparticipants, file= "bd_QC-keep.txt", row.names = FALSE, quote = FALSE)