Participant QC

The purpose of this script is to generate a list of UK Biobank participants which meet QC/filtering criteria:

Genetic ethnicity = Caucasian via Pan UKBB designation
Not an outlier for heterozygosity or missing genotype rate (poor quality genotype)
No sex chromosome aneuploidy
Self-reported sex matches genetic sex
Does not have a high degree of genetic kinship (ten or more third-degree relatives identified)
Does not appear in the output of “maximum_set_of_unrelated_individuals.pl”

Load packages + make it so table always displays NA’s

# Attach every package used in this script, hiding the start-up messages.
# plyr is attached before dplyr, so dplyr's same-named functions take precedence.
# `silent` only captures lapply()'s return value so it is not printed.
suppressMessages(silent <- lapply(
    c("plyr", "dplyr", "tidyverse", "data.table", "vroom", "knitr"),
    library, character.only = TRUE))
# Shadow base::table() so that an NA count is always reported unless the
# caller passes a different `useNA` value.
table <- function(..., useNA = "always") {
    base::table(..., useNA = useNA)
}

Load UK Biobank data

## [1] 502527   5172

This code chunk has been modified for display to hide the 5-digit code that came with our data table

# Read the tab-delimited UK Biobank phenotype table.
bd <- vroom(
    "/Users/mike/Documents/R_files/UKBpheno/pheno/ukbXXXXX.tab",
    delim = "\t",
    show_col_types = FALSE
)
# File provided by UKB ("ukbxxxxx_loaddata.R") without the loading part;
# it labels the responses in survey questions.
source("src/components/ukbXXXXX_factordata.R")
bd <- as_tibble(bd)
dim(bd)
# Drop participants whose ID appears in the withdrawal list. The CSV has no
# header row, so read.csv() names its first column V1.
withdrawn <- read.csv("src/components/w48818_20220222.csv", header = FALSE)
bd <- bd %>% filter(!(f.eid %in% withdrawn$V1))

Load the files that came from Pan UKBB, list of participants and bridge (key)

# Pan UKBB per-sample table (population label, PCs, relatedness flag).
pan <- read_tsv(file = "src/components/PanUKBB/all_pops_non_eur_pruned_within_pop_pc_covs.tsv")

## Rows: 448216 Columns: 28
## ── Column specification ─────────────────────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr  (1): pop
## dbl (26): s, PC1, PC2, PC3, PC4, PC5, PC6, PC7, PC8, PC9, PC10, PC11, PC12, ...
## lgl  (1): related
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

pan <- as_tibble(pan)
# read_tsv() parsed the sample ID column `s` as double; store it as integer
# before it is used as a join key.
pan$s <- as.integer(pan$s)
# Participants per Pan UKBB population. kable() comes from knitr, which is
# attached above; kbl() lives in kableExtra, which this script never loads.
table(pan$pop) %>% kable()

Var1	Freq
AFR	6805
AMR	996
CSA	9109
EAS	2783
EUR	426901
MID	1622
NA	0

# Bridge (key) file: two columns, named here as the UKB participant ID (IID)
# and the Pan UKBB sample ID (panID).
bridge <- as_tibble(read.table("src/components/PanUKBB/ukb48818bridge31063.txt"))
names(bridge) <- c("IID", "panID")

# Attach the UKB participant ID to each Pan UKBB sample and its population label.
pan2 <- left_join(select(pan, s, pop), bridge, by = c("s" = "panID"))

Retrieve relevant QC columns from UKB

# Pull the participant ID plus the six QC-related UKB fields, giving each a
# readable name in the same step.
bd_QC <- bd %>%
    select(
        IID = f.eid,
        Sex = f.31.0.0,
        Genetic_Sex = f.22001.0.0,
        Race = f.21000.0.0,
        Outliers_for_het_or_missing = f.22027.0.0,
        SexchrAneuploidy = f.22019.0.0,
        Genetic_kinship = f.22021.0.0
    )

Join UKB cols with Pan UKBB

# Row count before joining to Pan UKBB.
bd_QC <- as_tibble(bd_QC) # 502,527
nrow(bd_QC) # [1] 502527

## [1] 502527

# Keep only participants that also have a Pan UKBB record (matched on IID).
bd_QC <- inner_join(bd_QC, pan2, by = "IID")

Filter by selected criteria

# Filter by genetic ancestry: keep participants labelled "EUR" by Pan UKBB.
# filter() drops rows where the condition evaluates to NA, whereas logical
# `[` indexing on a tibble would return an all-NA row for each of them.
bd_QC <- bd_QC %>% filter(pop == "EUR")

# Drop participants flagged "Yes" as het/missingness outliers; rows with a
# missing flag are kept.
bd_QC <- bd_QC %>%
    filter(is.na(Outliers_for_het_or_missing) | Outliers_for_het_or_missing != "Yes")

# Drop participants flagged "Yes" for sex chromosome aneuploidy; rows with a
# missing flag are kept.
bd_QC <- bd_QC %>%
    filter(is.na(SexchrAneuploidy) | SexchrAneuploidy != "Yes")

# Drop participants with ten or more third-degree relatives; rows with a
# missing kinship value are kept.
bd_QC <- bd_QC %>%
    filter(is.na(Genetic_kinship) | Genetic_kinship != "Ten or more third-degree relatives identified")

# If Sex does not equal genetic sex, exclude the participant. Rows where either
# value is missing cannot be confirmed as matching and are dropped as well.
bd_QC <- bd_QC %>% filter(Sex == Genetic_Sex)

# Participants remaining after all filters above. The previously hard-coded
# counts disagreed with the rendered output, so the printed value is the reference.
nrow(bd_QC)

## [1] 426373

Remove participants listed in the maximum_set_of_unrelated_individuals.pl¹ output (relatedness filter):

# Read the IDs from the relatedness script's output file and flatten them
# into a plain integer vector.
max_unrelated <- read.table("src/components/ukb48818_rel_s488282_output.dat") %>%
    unlist() %>%
    as.integer()

# Exclude every participant whose IID appears in that list.
bd_QC <- filter(bd_QC, !(IID %in% max_unrelated))

table(bd_QC[["pop"]]) # Pan UKBB designation

## 
##    EUR   <NA> 
## 357778      0

table(bd_QC[["Race"]]) # self identified

## 
##       Prefer not to answer                Do not know 
##                        872                         64 
##                      White                      Mixed 
##                        357                          4 
##     Asian or Asian British     Black or Black British 
##                          0                          1 
##                    Chinese         Other ethnic group 
##                          1                        187 
##                    British                      Irish 
##                     342708                       7098 
## Any other white background  White and Black Caribbean 
##                       6203                          4 
##    White and Black African            White and Asian 
##                          3                         16 
## Any other mixed background                     Indian 
##                         81                          2 
##                  Pakistani                Bangladeshi 
##                          0                          0 
## Any other Asian background                  Caribbean 
##                          0                          0 
##                    African Any other Black background 
##                          0                          0 
##                       <NA> 
##                        177

Save list of participants for use in future filtering steps

# One-column table holding the IIDs that passed every QC filter.
QCkeepparticipants <- select(bd_QC, IID)

# Written with a header line ("IID"), no row names and no quoting.
write.table(
    QCkeepparticipants,
    file = "bd_QC-keep.txt",
    quote = FALSE,
    row.names = FALSE
)

Max unrelated script ↩︎

Participant QC

Mike Francis

2022-10-09 22:11:47

Load packages + make it so table always displays NA’s

Load UK Biobank data

Load the files that came from Pan UKBB, list of participants and bridge (key)

Retrieve relevant QC columns from UKB

Join UKB cols with Pan UKBB

Filter by selected criteria

Save list of participants for use in future filtering steps

Participant QC

Mike Francis

2022-10-09 22:11:47

Load packages + make it so table always displays NA’s

Load UK Biobank data

Load the files that came from Pan UKBB, list of participants and bridge (key)

Retrieve relevant QC columns from UKB

Join UKB cols with Pan UKBB

Filter by selected criteria

Filter related file by those in QC from maximum_set_of_unrelated_individuals.pl¹ output:

Save list of participants for use in future filtering steps

Filter related file by those in QC from maximum_set_of_unrelated_individuals.pl¹ output: