The purpose of this script is to generate a list of UK Biobank participants which meet QC/filtering criteria:

  1. Genetic ethnicity = Caucasian via Pan UKBB designation
  2. Not an outlier for heterogeneity and missing genotype rate (poor quality genotype)
  3. No sex chromosome aneuploidy
  4. Self-reported sex matches genetic sex
  5. Do not have high degree of genetic kinship (ten or more third-degree relatives identified)
  6. Does not appear in “maximum_set_of_unrelated_individuals.pl”

Load packages + make it so table always displays NA’s

suppressMessages(silent <- lapply(
    c("plyr", "dplyr", "tidyverse", "data.table", "vroom", "knitr"), 
    library, character.only=T))
table = function (..., useNA = 'always') base::table(..., useNA = useNA)

Load UK Biobank data

## [1] 502527   5172

This code chunk has been modified for display to hide the 5-digit code that came with our data table

bd=vroom("/Users/mike/Documents/R_files/UKBpheno/pheno/ukbXXXXX.tab", delim="\t", show_col_types = FALSE)
source("src/components/ukbXXXXX_factordata.R") #file provided by UKB "ukbxxxxx_loaddata.R" without the loading part, to label the responses in survey questions
bd=as_tibble(bd)
dim(bd)
withdrawn<-read.csv("src/components/w48818_20220222.csv", header = FALSE)
bd=bd[!(bd$f.eid %in% withdrawn$V1), ]

Load the files that came from Pan UKBB, list of participants and bridge (key)

pan<-read_tsv("src/components/PanUKBB/all_pops_non_eur_pruned_within_pop_pc_covs.tsv")
## Rows: 448216 Columns: 28
## ── Column specification ─────────────────────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr  (1): pop
## dbl (26): s, PC1, PC2, PC3, PC4, PC5, PC6, PC7, PC8, PC9, PC10, PC11, PC12, ...
## lgl  (1): related
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
pan<-as_tibble(pan)
pan$s<-as.integer(pan$s)
table(pan$pop)%>%kbl()
Var1 Freq
AFR 6805
AMR 996
CSA 9109
EAS 2783
EUR 426901
MID 1622
NA 0
bridge<-read.table("src/components/PanUKBB/ukb48818bridge31063.txt")
bridge<-as_tibble(bridge)
colnames(bridge)<-c("IID", "panID")

pan2<-pan%>%select(s, pop)%>%
    left_join(bridge, by=c("s"="panID"))

Retrieve relevant QC columns from UKB

bd_QC<- bd %>% select(f.eid, f.31.0.0, f.22001.0.0, f.21000.0.0, f.22027.0.0, f.22019.0.0, f.22021.0.0)

colnames(bd_QC)<-c("IID", "Sex", "Genetic_Sex", "Race", "Outliers_for_het_or_missing", "SexchrAneuploidy", "Genetic_kinship")

Join UKB cols with with Pan UKBB

bd_QC<-as_tibble(bd_QC) #502,527 
nrow(bd_QC) #[1] 502527 
## [1] 502527
bd_QC<-bd_QC%>%inner_join(pan2, by="IID")

Filter by selected criteria

#Filter by Genetic ethnicity = Caucasian VIA PAN UKBB 
bd_QC<-bd_QC[bd_QC$pop=="EUR",] #nrow(bd_QC) #[1] 426881

bd_QC<-bd_QC%>% filter(is.na(Outliers_for_het_or_missing) | Outliers_for_het_or_missing !="Yes") #nrow(bd_QC) #[1] 426433

bd_QC<-bd_QC%>% filter(is.na(SexchrAneuploidy) | SexchrAneuploidy != "Yes") #nrow(bd_QC) #[1] 425854

bd_QC<- bd_QC%>% filter(is.na(Genetic_kinship) | Genetic_kinship != "Ten or more third-degree relatives identified")

#If Sex does not equal genetic sex, exclude participant 
bd_QC<-bd_QC[bd_QC$Sex == bd_QC$Genetic_Sex,] 
nrow(bd_QC) #[1] 425683
## [1] 426373

Save list of participants for use in future filtering steps

QCkeepparticipants<-bd_QC%>%select(IID)

write.table(QCkeepparticipants, file= "bd_QC-keep.txt", row.names = FALSE, quote = FALSE)