The purpose of this script is to generate a list of UK Biobank participants who meet the QC/filtering criteria:
# Attach every package the script relies on, silencing start-up messages.
# `silent` only captures lapply()'s return value so it is not printed.
suppressMessages(silent <- lapply(
  c("plyr", "dplyr", "tidyverse", "data.table", "vroom", "knitr"),
  library,
  character.only = TRUE
))

# Mask base::table() so that NA counts are always reported by default.
table <- function(..., useNA = "always") base::table(..., useNA = useNA)
## [1] 502527 5172
This code chunk has been modified for display to hide the 5-digit code that came with our data table (shown as XXXXX in the file names below).
# Read the tab-delimited phenotype table.
bd <- vroom(
  "/Users/mike/Documents/R_files/UKBpheno/pheno/ukbXXXXX.tab",
  delim = "\t",
  show_col_types = FALSE
)
# File provided by UKB ("ukbxxxxx_loaddata.R") without the loading part,
# to label the responses in survey questions. It is sourced while `bd`
# exists in the global environment.
source("src/components/ukbXXXXX_factordata.R")
bd <- as_tibble(bd)
dim(bd)

# Drop every participant whose ID appears in the first column (V1) of the
# withdrawn list.
withdrawn <- read.csv("src/components/w48818_20220222.csv", header = FALSE)
bd <- bd[!(bd$f.eid %in% withdrawn$V1), ]

# Pan-UKBB covariate table (includes the `s` and `pop` columns used below).
pan <- read_tsv(
  "src/components/PanUKBB/all_pops_non_eur_pruned_within_pop_pc_covs.tsv"
)
## Rows: 448216 Columns: 28
## ── Column specification ─────────────────────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (1): pop
## dbl (26): s, PC1, PC2, PC3, PC4, PC5, PC6, PC7, PC8, PC9, PC10, PC11, PC12, ...
## lgl (1): related
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
pan <- as_tibble(pan)
# `s` is read as double; convert it to integer before it is used as a join key.
pan$s <- as.integer(pan$s)
# Count participants per Pan-UKBB population label.
# NOTE(review): kbl() is not exported by any package in the library list
# visible in this file (it comes from kableExtra) -- confirm it is attached.
table(pan$pop) %>% kbl()
Var1 | Freq |
---|---|
AFR | 6805 |
AMR | 996 |
CSA | 9109 |
EAS | 2783 |
EUR | 426901 |
MID | 1622 |
NA | 0 |
# Bridge file: two columns, named here IID and panID, used to translate the
# Pan-UKBB sample ID (`s`) into this dataset's participant ID.
bridge <- read.table("src/components/PanUKBB/ukb48818bridge31063.txt")
bridge <- as_tibble(bridge)
colnames(bridge) <- c("IID", "panID")

# Attach IID to each Pan-UKBB sample; samples without a bridge entry keep
# IID = NA because this is a left join.
pan2 <- pan %>%
  select(s, pop) %>%
  left_join(bridge, by = c("s" = "panID"))

# Keep only the columns needed for QC and give them readable names.
bd_QC <- bd %>%
  select(
    f.eid, f.31.0.0, f.22001.0.0, f.21000.0.0,
    f.22027.0.0, f.22019.0.0, f.22021.0.0
  )
colnames(bd_QC) <- c(
  "IID", "Sex", "Genetic_Sex", "Race",
  "Outliers_for_het_or_missing", "SexchrAneuploidy", "Genetic_kinship"
)
bd_QC <- as_tibble(bd_QC) # 502,527
nrow(bd_QC) # [1] 502527
## [1] 502527
# Keep only participants present in both the phenotype data and Pan-UKBB.
bd_QC <- bd_QC %>% inner_join(pan2, by = "IID")

# Filter by genetic ethnicity = Caucasian via Pan-UKBB.
# filter() drops rows where the condition is NA, whereas base `[` subsetting
# would insert all-NA rows for them.
bd_QC <- bd_QC %>% filter(pop == "EUR") # nrow(bd_QC) # [1] 426881

# Exclude heterozygosity / missingness outliers; NA means "not flagged".
bd_QC <- bd_QC %>%
  filter(
    is.na(Outliers_for_het_or_missing) | Outliers_for_het_or_missing != "Yes"
  ) # nrow(bd_QC) # [1] 426433

# Exclude sex-chromosome aneuploidy; NA means "not flagged".
bd_QC <- bd_QC %>%
  filter(
    is.na(SexchrAneuploidy) | SexchrAneuploidy != "Yes"
  ) # nrow(bd_QC) # [1] 425854

# Exclude participants flagged with excess relatedness.
bd_QC <- bd_QC %>%
  filter(
    is.na(Genetic_kinship) |
      Genetic_kinship != "Ten or more third-degree relatives identified"
  )

# If Sex does not equal genetic sex, exclude participant.
# Compared as character so the test does not depend on the two columns
# sharing identical factor levels; rows where either value is NA are dropped.
bd_QC <- bd_QC %>%
  filter(as.character(Sex) == as.character(Genetic_Sex))
# NOTE(review): the inline count below (425683) disagrees with the rendered
# output (426373); the inline counts may be stale.
nrow(bd_QC) # [1] 425683
## [1] 426373
# Write the IDs of all participants that passed QC, one per line under an
# "IID" header, for use as a keep list downstream.
QCkeepparticipants <- bd_QC %>% select(IID)
write.table(
  QCkeepparticipants,
  file = "bd_QC-keep.txt",
  row.names = FALSE,
  quote = FALSE
)