Defining “high quality” vegetarians for use in this analysis

Purpose of this script:

  • Find the self-identified vegetarians in the UK Biobank dietary recall survey instances

  • Assess their consistency with other parts of the dietary survey.

  • Obtain a list of high quality vegetarians to use for subsequent analyses.


Veg5yr1 is the vegetarian variable used in the main analysis. Being categorized as a vegetarian means participants adhered to all four of these criteria:

  • Self-identified as a vegetarian or vegan the first time they were asked on a 24HR survey

  • Didn’t indicate eating any meat or fish on that same first 24HR survey

  • Indicated no major dietary changes over past 5 years

  • No meat or fish eating indicated on initial assessment


The biomarker data that we use in this analysis was taken at the IA. Therefore this is a combination of several columns that describe the quality of vegetarianism at that same time point.

There are also some plots here showing general qualities of the 24HR recall survey that were done in my data exploration that don’t factor into the main analysis (i.e., Plot: How many repeat participants were in each instance of the 24HR? , Plot: Percent consistent vegetarians vs number of recall survey’s taken across 3 years, [Identify those who were vegetarian at two consecutive 24HR instances])

Load data

Load packages + special functions

suppressMessages(silent <- lapply(
    c("plyr", "dplyr", "tidyverse", "data.table", "vroom", "knitr","kableExtra","cowplot"), 
    library, character.only=T))

table = function (..., useNA = 'always') base::table(..., useNA = useNA)

mutate_when <- function(data, ...) {
    dots <- eval(substitute(alist(...)))
    for (i in seq(1, length(dots), by = 2)) {
        condition <- eval(dots[[i]], envir = data)
        mutations <- eval(dots[[i + 1]], 
                          envir = data[condition, , drop = FALSE])
        data[condition, names(mutations)] <- mutations
    }
    data
}

Load UK Biobank dataset

## [1] 502527   5172
bd=vroom("/Users/mike/Documents/R_files/UKBpheno/pheno/ukbXXXXX.tab", delim="\t", show_col_types = FALSE)
bd=as_tibble(bd)
source("src/components/ukbXXXXX_factordata.R") #file provided by UKB "ukbXXXXX_loaddata.R" without the loading part, to label the responses in survey questions
dim(bd)

Identify instances where 24 hour dietary recall (24HR) was taken by participants. Possible instances are 0 through 4.

“daycols” are days of the week that the 24HR was taken, i.e. a proxy for whether the survey was completed. In these columns, NA=survey wasn’t taken that instance.

We change the days of the week to 1 because we’re not interested in which day it actually was.

#First, remove withdrawn participants
withdrawn=read.csv("src/components/w48818_20220222.csv", header = FALSE)
nrow(withdrawn)
## [1] 114
bd=bd[!(bd$f.eid %in% withdrawn$V1), ]
nrow(bd)
## [1] 502413
#Identify columns with days of the week
daycols<-c("f.20080.0.0", "f.20080.1.0", "f.20080.2.0",
           "f.20080.3.0", "f.20080.4.0")

bd1<-bd[,c("f.eid", daycols)]
#Change all the factors to characters
bd1[,daycols]<-sapply(bd1[,daycols], as.character)

#Change NA's to zeros and days to 1's
bd1[,daycols][(is.na(bd1[,daycols]))]<-"0"
bd1[,daycols][bd1[,daycols]!="0"] <-"1"

#Change these back to numeric
bd1[,daycols]<-sapply(bd1[,daycols], as.integer)

#Now make a new column, everyone with rowSums zero never took recall
#and those with >0 took it at least once
bd1$times_taken_24HR<-rowSums(bd1[,daycols])
bd1$ever_took_24HR<-0
bd1$ever_took_24HR[bd1$times_taken_24HR>0]<-1
bd1$times_taken_24HR[bd1$times_taken_24HR==0]<-NA

sum(bd1$ever_took_24HR)
## [1] 210967
#TP = time point
colnames(bd1)<-c("IID", "RecallTP0", "RecallTP1",
                 "RecallTP2","RecallTP3","RecallTP4",
                 "times_taken_24HR", "ever_took_24HR")
#Optional save here
#write.csv(bd1, "UKB_24hourRecall-ParticipantInstancesTaken.csv", quote=F, row.names=F)

During what time periods did the questionnaire cycles occur?

Plot: How many times did participants take 24HR surveys

perinstance=bd1%>%select(RecallTP0:RecallTP4)


timestaken<-bd1%>%select(times_taken_24HR)%>%
    group_by(times_taken_24HR)%>%summarise(N=n())

timestaken%>%kbl()
times_taken_24HR N
1 84145
2 48108
3 42482
4 30467
5 5765
NA 291446
p1=ggplot(data=timestaken, aes(x=times_taken_24HR, y=N)) +
    geom_bar(stat="identity", width=0.5, fill ="#8C8C8C")+
    ylab("Number of participants")+
    xlab("Number of 24 hour recall surveys taken")+
    ggtitle(paste("N = ", sum(timestaken$N[1:5]), sep=""))+
    theme_bw()

p1
## Warning: Removed 1 rows containing missing values (position_stack).

Plot: How many repeat participants were in each instance of the 24HR?

take=bd1[paste("RecallTP", 0:4, sep="")]
take=take[!rowSums(take)==0,]

take.plot=take%>% 
    gather(key = v, value = total, RecallTP0:RecallTP4, na.rm = T)%>%
    group_by(v)%>%count(total)%>%filter(total==1)

colnames(take.plot)=c("RecallTP", "Group", "n")
take.plot$RecallTP=0:4
as_tibble(take.plot)%>%kbl()
RecallTP Group n
0 1 70692
1 1 100577
2 1 83243
3 1 103765
4 1 100223

This next script chunk is incredibly repetitive and I’m sure there is a more efficient way to write it but it was faster for me to just write it “manually”.

rep=paste("repeat", 0:4, sep="")
take[rep]=0
take[rep[1]]=0
nrow(take[!rowSums(take)==0,])
## [1] 210967
#TP1, previously took once
take$repeat1[take$RecallTP0==1 &
                 take$RecallTP1==1] = 1

#TP2, previously took at least once
take$repeat2[(take$RecallTP0==1 | 
                  take$RecallTP1==1) &
                 take$RecallTP2==1] = 1

#TP2, previously took twice
take$repeat2[(take$RecallTP0==1 & 
                  take$RecallTP1==1) &
                 take$RecallTP2==1] = 2

#TP3, previously took at least once
take$repeat3[(take$RecallTP0==1 | 
                take$RecallTP1==1| 
                take$RecallTP2==1) &
                 take$RecallTP3==1] = 1

#TP3, previously took at least twice
take$repeat3[(
    (take$RecallTP0==1 & take$RecallTP1==1) | 
    (take$RecallTP1==1 & take$RecallTP2==1) |
    (take$RecallTP0==1 & take$RecallTP2==1)
    ) & take$RecallTP3==1] = 2

#TP3, previously took three times
take$repeat3[(
            take$RecallTP0==1 &  
            take$RecallTP1==1  & 
             take$RecallTP2==1 &
             take$RecallTP3==1) ] = 3

#TP4, previously took at least once
take$repeat4[(take$RecallTP0==1 | 
                  take$RecallTP1==1| 
                  take$RecallTP2==1| 
                  take$RecallTP3==1) &
                 take$RecallTP4==1] = 1

#TP4, previously took at least twice
take$repeat4[(
    (take$RecallTP0==1 & take$RecallTP1==1) | 
    (take$RecallTP0==1 & take$RecallTP2==1) |
    (take$RecallTP0==1 & take$RecallTP3==1) |
    (take$RecallTP1==1 & take$RecallTP2==1) | 
    (take$RecallTP1==1 & take$RecallTP3==1) |
    (take$RecallTP2==1 & take$RecallTP3==1) 
    ) & take$RecallTP4==1] = 2

#TP4, previously took at least three times
take$repeat4[(
(take$RecallTP0==1 & take$RecallTP1==1 & take$RecallTP2==1) |
(take$RecallTP0==1 & take$RecallTP2==1 & take$RecallTP3==1) |
(take$RecallTP1==1 & take$RecallTP2==1 & take$RecallTP3==1) 
) & take$RecallTP4==1] = 3

#TP4, previously took four times
take$repeat4[
    take$RecallTP0==1 & take$RecallTP1==1
     & take$RecallTP2==1 & take$RecallTP3==1 
    & take$RecallTP4==1] = 4

#NA for those who didn't take the recall in that instance
take$repeat0[take$RecallTP0==0]=NA
take$repeat1[take$RecallTP1==0]=NA
take$repeat2[take$RecallTP2==0]=NA
take$repeat3[take$RecallTP3==0]=NA
take$repeat4[take$RecallTP4==0]=NA

sapply(take, table)[6:10] #Count repeats at each instance
## $repeat0
## 
##      0   <NA> 
##  70692 140275 
## 
## $repeat1
## 
##      0      1   <NA> 
##  79599  20978 110390 
## 
## $repeat2
## 
##      0      1      2   <NA> 
##  27105  46006  10132 127724 
## 
## $repeat3
## 
##      0      1      2      3   <NA> 
##  19639  38856  37764   7506 107202 
## 
## $repeat4
## 
##      0      1      2      3      4   <NA> 
##  13932  20982  34122  25422   5765 110744
take.plot=data.frame(0:4, sapply(take, sum)[1:5], row.names = NULL)
colnames(take.plot)=c("RecallTP", "N")

y=data.frame()
for (i in 0:4){
    x=take%>%group_by_(paste("RecallTP", i,sep=""), paste("repeat",i,sep=""))%>%
        summarise(count=n())%>%
        rename_("RecallTP" = paste("RecallTP", i,sep=""))%>%
        rename_("group" = paste("repeat", i,sep=""))
    x=x[x$RecallTP==1,]
    x$RecallTP=i
    y=rbind(y,x)
}
## `summarise()` has grouped output by 'RecallTP0'. You can override using the `.groups` argument.
## `summarise()` has grouped output by 'RecallTP1'. You can override using the `.groups` argument.
## `summarise()` has grouped output by 'RecallTP2'. You can override using the `.groups` argument.
## `summarise()` has grouped output by 'RecallTP3'. You can override using the `.groups` argument.
## `summarise()` has grouped output by 'RecallTP4'. You can override using the `.groups` argument.
y$group=mapvalues(y$group, from=c(0,1,2,3,4), 
                  to=c("First time taking",
                       "Second time taking",
                       "Third time taking",
                       "Fourth time taking",
                       "Fifth time taking"))

y$group=factor(y$group, 
               levels =rev(unique(y$group)))


p2=ggplot(data=y, aes(x=RecallTP, y=count, fill=group, color=group)) +
    geom_bar(stat="identity", width=0.5, size=1)+
    ylab("Count participants")+
    xlab("Instance of 24 hour recall")+
    theme_bw()+
    guides(fill = guide_legend(reverse = TRUE),
           color=guide_legend(reverse = TRUE))+
    theme(legend.title=element_blank())+
     scale_fill_brewer(palette="Spectral")+
    scale_colour_manual(guide="none", values=c(rep("NA", 4), "black"))

p2

Continue with veg definitions: start a new dataframe called “new” and add Age and Sex columns

new<-bd1

#PUT AGE AND SEX COLUMNS
agesex<-bd%>%select(f.eid, f.21003.0.0, f.31.0.0)
colnames(agesex)<-c("IID", "Age", "Sex")
new<-left_join(new,agesex)%>%select(IID, Age, Sex, everything())
## Joining, by = "IID"

Add “major dietary change” column

f.1538 = “Have you made any major changes to your diet in the last 5 years?”

  • 0 = “No”

  • 1= “Yes, because of illness | Yes, because of other reasons | Prefer not to answer”

changevar=bd%>%select(f.eid, f.1538.0.0)
colnames(changevar)=c("IID", "DietChanges5yr")

changevar[2]<-sapply(changevar[2], mapvalues, 
                         c("Prefer not to answer", 
                           "Yes, because of illness", "Yes, because of other reasons",
                           "No"),
                         c(1,1,1,0))
table(changevar$DietChanges5yr)%>%kbl()
Var1 Freq
0 303424
1 198092
NA 897
new=left_join(new,changevar)
## Joining, by = "IID"

Check meat and fish eater fields of initial assessment (IA).

#Category 100052: Diet. Initial assessment
IAcols<-c("f.1329.0.0", "f.1339.0.0", 
          "f.1349.0.0", "f.1359.0.0", 
          "f.1369.0.0", "f.1379.0.0", 
          "f.1389.0.0")

names(IAcols)<-c("IAOilyfish", "IANonoilyfish",
                 "IAProcessedmeat","IAPoultry",
                 "IABeef","IALamb",
                 "IAPork")

IAcols.new<-as_tibble(sapply(bd[IAcols], as.character))
colnames(IAcols.new)<-names(IAcols)

#Put a TRUE in this column IAveg if they answered Never in all the meat+fish columns
IAcols.new$IAveg<-rowSums(bd[IAcols]=="Never")==7 

table(IAcols.new$IAveg)%>%kbl()
Var1 Freq
FALSE 492390
TRUE 9126
NA 897
IAcols.new$IID<-bd$f.eid
new<-left_join(new, IAcols.new)
## Joining, by = "IID"

Add the column “Age when last ate meat”, which wasn’t used in the analysis.

# IA Age when last ate meat/"Never eaten meat" -----------------------
#https://biobank.ndph.ox.ac.uk/ukb/field.cgi?id=3680
#"How old were you when you last ate any kind of meat? 
#(Enter "0" if you have never eaten meat in your lifetime)"
#on the initial assessment
IA.ageLastAteMeat<-bd%>%select(f.eid, f.3680.0.0)
colnames(IA.ageLastAteMeat)<-c("IID", "IAageLastAteMeat")
new<-left_join(new, IA.ageLastAteMeat)
## Joining, by = "IID"
neverate=new%>%select(IAageLastAteMeat, IAOilyfish:IAPork)%>%filter(IAageLastAteMeat==0)

sapply(neverate, table)
## $IAageLastAteMeat
## 
##    0 <NA> 
## 1230    0 
## 
## $IAOilyfish
## 
##      2-4 times a week      5-6 times a week           Do not know 
##                    28                     5                     7 
## Less than once a week                 Never           Once a week 
##                    49                  1098                    40 
##    Once or more daily                  <NA> 
##                     3                     0 
## 
## $IANonoilyfish
## 
##      2-4 times a week      5-6 times a week           Do not know 
##                    44                     3                     6 
## Less than once a week                 Never           Once a week 
##                    44                  1069                    59 
##    Once or more daily  Prefer not to answer                  <NA> 
##                     4                     1                     0 
## 
## $IAProcessedmeat
## 
## Never  <NA> 
##  1230     0 
## 
## $IAPoultry
## 
## Never  <NA> 
##  1230     0 
## 
## $IABeef
## 
## Never  <NA> 
##  1230     0 
## 
## $IALamb
## 
## Never  <NA> 
##  1230     0 
## 
## $IAPork
## 
## Never  <NA> 
##  1230     0
lev=c("Once or more daily","5-6 times a week","2-4 times a week",
       "Once a week","Less than once a week", 
       "Do not know", "Never")


neverate$IAOilyfish<-factor(neverate$IAOilyfish, levels=lev)
neverate$IANonoilyfish<-factor(neverate$IANonoilyfish, levels=lev)
x1<-neverate%>%group_by(IAOilyfish)%>%summarise(N=n())%>%mutate(Category="Oily fish")
x2<-neverate%>%group_by(IANonoilyfish)%>%summarise(N=n())%>%mutate(Category="Non-oily fish")
colnames(x1)[1]="Frequency"
colnames(x2)[1]="Frequency"
neverate.plot<-rbind(x1, x2)
neverate.plot<-neverate.plot[-15,] #take out the "prefer not to answer" 1 person

ggplot(data=neverate.plot, aes(x=Category, y=N, fill=Frequency)) +
    geom_bar(position="stack", stat="identity", width=0.5)+
    ylab("Number of participants")+
    ggtitle(paste('Fish eating frequency of those who, on the same survey, 
Answered 0 to:
            "How old were you when you last ate any kind of meat? 
            (Enter "0" if you have never eaten meat in your lifetime)"
N=', nrow(neverate)-1, sep=""))+
    theme_bw()+
     scale_fill_brewer(palette="Spectral")

Special diet column in 24HR where participants self-identified as vegetarian/vegan

This next section solves the problem that if someone has an NA in field 20086 (24hr-recall: special diet followed), it could mean two possible things:

  • They did not answer this question in the affirmative or

  • They didn’t take the 24h recall in that instance,

since it is marked as NA in both of these situations. (I.e. there is no “I didn’t follow a special diet” response in UKB Data-Coding 76).

Therefore response in this column was compared to the column indicating whether they took the survey in that instance to get that information.1

The question on the survey:

Special<-c("f.eid", "f.20086.0.0", "f.20086.1.0",
                "f.20086.2.0", "f.20086.3.0", "f.20086.4.0")

names(Special)<- c("IID", "SpecialVeg0","SpecialVeg1", "SpecialVeg2", 
                   "SpecialVeg3", "SpecialVeg4")

Special.cols<-as_tibble(sapply(bd[Special], as.character))
colnames(Special.cols)<-names(Special)

The array index for f.20086 runs from 0 to 5 (https://biobank.ndph.ox.ac.uk/ukb/field.cgi?id=20086). This handy piece of code will consolidate anyone who said vegetarian in indices 1:5 and put that information in array 0 of the corresponding survey instance so we don’t have to worry about the additional array slots for those people that had two special diets.

It also seems that at instance 0 UKB had potentially not enabled this “multiple special diets” feature as f.20086.0.1:f.20086.0.5 are all empty in all participants. However in instances 1:4 there are many participants who indicated multiple special diets.

#to see that f.20086.0.1:f.20086.0.5 are all empty:
#sapply(bd%>%select(f.20086.0.1:f.20086.0.5), table)

#For vegetarians
for (i in 0:4){
    cols.i=(sprintf("f.20086.%1.0f.%1.0f", i,1:5))
    #cols.0=(sprintf("f.20086.%1.0f.%1.0f", i,0))
    Special.cols[sprintf("SpecialVeg%1.0f", i)][
            apply(bd[cols.i], 1, 
            function(x) any(x %in% c("Vegetarian"))),
        ]="Vegetarian"
}

#For vegans
for (i in 0:4){
    cols.i=(sprintf("f.20086.%1.0f.%1.0f", i,1:5))
    #cols.0=(sprintf("f.20086.%1.0f.%1.0f", i,0))
    Special.cols[sprintf("SpecialVeg%1.0f", i)][
            apply(bd[cols.i], 1, 
            function(x) any(x %in% c("Vegan"))),
        ]="Vegan"
    }

Vegetarian = 1 ; Vegan = 1; all others and NA get 0 here, Will switch “true NAs” who didn’t take the survey at this instance back to NA in next step

Special.veg=Special.cols
Special.veg[2:6]<-sapply(Special.cols[2:6], mapvalues, 
                    c(NA, "Low calorie", "Gluten-free", 
                      "Lactose-free", "Other",
                      "Vegetarian", "Vegan"),
                    c(0,0,0,0,0,1,1))
Special.veg<-as_tibble(sapply(Special.veg, as.numeric))
new<-left_join(new, Special.veg)
## Joining, by = "IID"

Switch those zeros in Special Veg columns back to NAs for those who didn’t take survey at that instance

for (i in 0:4){
    new[paste("SpecialVeg", i, sep="")][
        new[paste("RecallTP", i, sep="")]==0
    ]=NA
}

Make separate column indicating self-identified vegans

Special.vegan<-Special.cols
Special.vegan[2:6]<-sapply(Special.cols[2:6], mapvalues, 
                         c(NA, "Low calorie", "Gluten-free", 
                           "Lactose-free", "Other",
                           "Vegetarian", "Vegan"),
                         c(0,0,0,0,0,0,1))
colnames(Special.vegan)=c("IID", "SpecialVegan0","SpecialVegan1", 
                          "SpecialVegan2", 
                        "SpecialVegan3", "SpecialVegan4")
Special.vegan<-as_tibble(sapply(Special.vegan, as.numeric))
new<-left_join(new, Special.vegan)
## Joining, by = "IID"
#Switch those zeros in Special Veg columns back to NAs 
#for those who didn't take survey at that instance
for (i in 0:4){
    new[paste("SpecialVegan", i, sep="")][
        new[paste("RecallTP", i, sep="")]==0
    ]=NA
}

Find proportion of times each participant answered vegetarian on 24H

new$SpecialTimesVeg=ifelse(
    apply(
        is.na(new[paste("SpecialVeg", 0:4, sep="")]),1,all),NA,
    rowSums(new[paste("SpecialVeg", 0:4, sep="")], na.rm = T
    ))

#Proportion of times taken 24HR that answered vegetarian
new$SpecialPropVeg=
    new$SpecialTimesVeg/new$times_taken_24HR

Plot: Percent consistent vegetarians vs number of recall survey’s taken across 3 years of 24HR surveys

Tracking the percentage of participants who self-identified as vegetarian every time they took the 24HR (100% of responses were vegetarian/vegan) versus the number of times the recall was taken by said participants

#People who said they were vegetarian at least once
times<-new%>%select(times_taken_24HR, SpecialTimesVeg)%>%
    filter(complete.cases(.), SpecialTimesVeg>0)

nveg=nrow(times)

times.res<-matrix(data=NA, nrow=5, ncol=4)
for (i in 1:5){
    times.res[i,1]<-i
    times.res[i,2]<-nrow(times%>%filter(times_taken_24HR==i)) 
    times.res[i,3]<-nrow(times%>%filter(SpecialTimesVeg==i,
                                        times_taken_24HR==i))
    times.res[i,4]<-(times.res[i,3]/times.res[i,2])*100
}

colnames(times.res)<-c("Times24HR",
                        "N took 24HR this many times and said vegetarian at least once on a survey", 
                       "N who took 24HR this many times said Veg in every 24HR they took",
                       "PercentVeg")
times.res<-as.data.frame(times.res)
times.res$Times24HR<-as.factor(times.res$Times24HR)

times.res%>%kbl()
Times24HR N took 24HR this many times and said vegetarian at least once on a survey N who took 24HR this many times said Veg in every 24HR they took PercentVeg
1 2781 2781 100.00000
2 2130 1380 64.78873
3 2162 1229 56.84551
4 1672 925 55.32297
5 370 168 45.40541
greens=c("#36E072", "#2EC062", "#27A152", "#1F8141", "#176131")

p3=ggplot(data=times.res, aes(x=Times24HR,
                           y=PercentVeg, fill=Times24HR))+
    geom_bar(stat="identity", width=0.5)+
    ggtitle(paste("N=", nveg,"participants who indicated
being vegetarian/vegan at least once on
a 24HR dietary assessment", sep=" "))+
    xlab("Number of 24 hour recall surveys taken")+
    ylab("Vegetarian/vegan consistently
across all surveys (%)")+
    theme_bw()+
    scale_fill_manual(values=greens, guide="none")

p3

Identify those who were vegetarian at just first 24HR they took

(First 24HR is called “next” in the script, as in next after the initial assessment).

specialVeg=new%>%select(SpecialVeg0:SpecialVeg4)

#Find positions of all NA's
NonNAindex=apply(specialVeg, 1,FUN = function(x) which(!is.na(x)), 
                 simplify = T)

#Take first non-NA value
specialVeg$NonNAindex1=sapply(NonNAindex,"[",1)
head(specialVeg)%>%kbl()
SpecialVeg0 SpecialVeg1 SpecialVeg2 SpecialVeg3 SpecialVeg4 NonNAindex1
NA NA NA NA NA NA
NA NA NA NA NA NA
0 NA NA NA NA 1
NA NA NA NA NA NA
NA NA NA 0 0 4
NA 0 0 NA 0 2
#Find the first time they answered the vegetarianism question:
specialVeg$nextSpecialVeg=0

#this loop takes a long time but couldn't figure out how to do it with vectors
for (i in 1:nrow(specialVeg)){
    if(!is.na(specialVeg$NonNAindex1[i])){
        #If the first non-NA special Veg = 1, set this to 1
        if (specialVeg[i, specialVeg$NonNAindex1[i]]==1)
            {specialVeg$nextSpecialVeg[i]=1}
        #else set to zero
        else {specialVeg$nextSpecialVeg[i]=0}
    }
    #Else set to NA
    else{specialVeg$nextSpecialVeg[i]=NA}
}

#Add to main table
new$nextSpecialVeg=specialVeg$nextSpecialVeg

new%>%filter(SpecialTimesVeg>0)%>%select(SpecialVeg0:SpecialVeg4, nextSpecialVeg)%>%head(20)%>%kbl()%>%row_spec(18, background="orange")
SpecialVeg0 SpecialVeg1 SpecialVeg2 SpecialVeg3 SpecialVeg4 nextSpecialVeg
NA NA NA NA 1 1
NA 1 NA NA 1 1
NA 1 NA NA NA 1
NA 1 1 1 NA 1
NA 1 NA NA NA 1
NA 1 1 1 1 1
NA 1 NA 1 1 1
NA NA NA NA 1 1
1 1 1 0 1 1
NA 1 1 NA 1 1
NA NA NA NA 1 1
NA 1 NA NA 1 1
NA 1 0 1 1 1
1 NA NA NA 1 1
NA 1 1 NA 1 1
NA NA 1 1 NA 1
NA NA NA NA 1 1
0 0 1 NA NA 0
1 0 0 NA NA 1
1 NA NA NA NA 1

Identify those who ate meat or fish in the 24HR

#Category 100106: Meat/fish yesterday. 24HR.
Meatfish24<-bd%>%select(f.103000.0.0, f.103000.1.0, f.103000.2.0, 
                        f.103000.3.0, f.103000.4.0, 
                        f.103140.0.0, f.103140.1.0, f.103140.2.0,
                        f.103140.3.0, f.103140.4.0)


colnames(Meatfish24)<-c("Meat0", "Meat1","Meat2",
                        "Meat3","Meat4",
                        "Fish0","Fish1","Fish2",
                        "Fish3","Fish4")

Meatfish24<-as_tibble(sapply(Meatfish24, 
                         mapvalues, c("No", "Yes"), c(0, 1)))
Meatfish24<-as_tibble(sapply(Meatfish24, as.numeric))
Meatfish24
## # A tibble: 502,413 × 10
##    Meat0 Meat1 Meat2 Meat3 Meat4 Fish0 Fish1 Fish2 Fish3 Fish4
##    <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
##  1    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA
##  2    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA
##  3     0    NA    NA    NA    NA     0    NA    NA    NA    NA
##  4    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA
##  5    NA    NA    NA     1     1    NA    NA    NA     1     0
##  6    NA     0     1    NA     1    NA     0     0    NA     0
##  7    NA    NA     0    NA    NA    NA    NA     0    NA    NA
##  8    NA     1    NA     0    NA    NA     1    NA     1    NA
##  9    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA
## 10    NA    NA    NA    NA     0    NA    NA    NA    NA     0
## # … with 502,403 more rows
Meatfish24$Meatfish24Sum<-ifelse(
    apply(
        is.na(Meatfish24),1,all),NA,
    rowSums(Meatfish24, na.rm = T)
    )

Meatfish24$IID<-bd$f.eid
new<-left_join(new, Meatfish24)
## Joining, by = "IID"
new$MeatFish24Veg<-NA
new$MeatFish24Veg[new$Meatfish24Sum==0]<-1
table(new$MeatFish24Veg)
## 
##      1   <NA> 
##  18631 483782
meat2=new%>%select(Meat0:Meat4)
fish2=new%>%select(Fish0:Fish4)

#Find positions of all NA's
#NonNAindex strategy same as above, identical for Special, Meat and Fish cols.
meat2$NonNAindex1=sapply(NonNAindex,"[",1)
fish2$NonNAindex1=sapply(NonNAindex,"[",1)


# Just the first 24HR time point -------------------------------------
meat2$nextMeat24=0
#0= didn't eat meat. 1 = ate meat
for (i in 1:nrow(meat2)){
    if(!is.na(meat2$NonNAindex1[i])){
        if (meat2[i, meat2$NonNAindex1[i]]==1){meat2$nextMeat24[i]=1}
        else {meat2$nextMeat24[i]=0}
    }
    else{meat2$nextMeat24[i]=NA}
}

fish2$nextFish24=0
#0= didn't eat fish. 1 = ate fish
for (i in 1:nrow(fish2)){
    if(!is.na(fish2$NonNAindex1[i])){
        if (fish2[i, fish2$NonNAindex1[i]]==1){fish2$nextFish24[i]=1}
        else {fish2$nextFish24[i]=0}
    }
    else{fish2$nextFish24[i]=NA}
}

#Add to main table
new$nextMeat24=meat2$nextMeat24
new$nextFish24=fish2$nextFish24

Plot: those who indicated being a vegetarian/vegan and also indicated meat or fish eating on that same survey

#ate<-dat[c(17:21, 35:44)]
ate=new%>%select(SpecialVeg0:SpecialVeg4, Meat0:Fish4)

#initialize four categories as columns in data frame "ate"
Vegatemeat<-paste("vegAteMeatOnly", 0:4, sep="")
ate[Vegatemeat]<-NA
Vegatefish<-paste("vegAteFishOnly", 0:4, sep="")
ate[Vegatefish]<-NA
Vegatemeatandfish<-paste("vegAteMeatAndFish", 0:4, sep="")
ate[Vegatemeatandfish]<-NA
Vegdidnt<-paste("vegDidnt", 0:4, sep="")
ate[Vegdidnt]<-NA


for (i in 0:4){
    #Loop over instances 0:4 of 24HR survey
    print(paste(sum(ate[paste("SpecialVeg", i, sep="")]==1, na.rm=T), "self-identified veg at tp", i))
    
    ate[paste("vegAteMeatOnly", i, sep="")][
        ate[paste("Meat", i, sep="")]==1 &
        ate[paste("Fish", i, sep="")]==0 &
        ate[paste("SpecialVeg", i, sep="")]==1
        ]=1
    
    ate[paste("vegAteFishOnly", i, sep="")][
        ate[paste("Meat", i, sep="")]==0 &
        ate[paste("Fish", i, sep="")]==1 &
        ate[paste("SpecialVeg", i, sep="")]==1
        ]=1
    
    ate[paste("vegAteMeatAndFish", i, sep="")][
        ate[paste("Meat", i, sep="")]==1 &
            ate[paste("Fish", i, sep="")]==1 &
            ate[paste("SpecialVeg", i, sep="")]==1
    ]=1
    
    ate[paste("vegDidnt", i, sep="")][
        ate[paste("Meat", i, sep="")]==0 &
        ate[paste("Fish", i, sep="")]==0 &
        ate[paste("SpecialVeg", i, sep="")]==1
        ]=1
}
## [1] "2515 self-identified veg at tp 0"
## [1] "3985 self-identified veg at tp 1"
## [1] "3241 self-identified veg at tp 2"
## [1] "4050 self-identified veg at tp 3"
## [1] "3883 self-identified veg at tp 4"
colnames(ate)
##  [1] "SpecialVeg0"        "SpecialVeg1"        "SpecialVeg2"       
##  [4] "SpecialVeg3"        "SpecialVeg4"        "Meat0"             
##  [7] "Meat1"              "Meat2"              "Meat3"             
## [10] "Meat4"              "Fish0"              "Fish1"             
## [13] "Fish2"              "Fish3"              "Fish4"             
## [16] "vegAteMeatOnly0"    "vegAteMeatOnly1"    "vegAteMeatOnly2"   
## [19] "vegAteMeatOnly3"    "vegAteMeatOnly4"    "vegAteFishOnly0"   
## [22] "vegAteFishOnly1"    "vegAteFishOnly2"    "vegAteFishOnly3"   
## [25] "vegAteFishOnly4"    "vegAteMeatAndFish0" "vegAteMeatAndFish1"
## [28] "vegAteMeatAndFish2" "vegAteMeatAndFish3" "vegAteMeatAndFish4"
## [31] "vegDidnt0"          "vegDidnt1"          "vegDidnt2"         
## [34] "vegDidnt3"          "vegDidnt4"
ate=ate[16:35]
ate
## # A tibble: 502,413 × 20
##    vegAteMeatO…¹ vegAt…² vegAt…³ vegAt…⁴ vegAt…⁵ vegAt…⁶ vegAt…⁷ vegAt…⁸ vegAt…⁹
##            <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>
##  1            NA      NA      NA      NA      NA      NA      NA      NA      NA
##  2            NA      NA      NA      NA      NA      NA      NA      NA      NA
##  3            NA      NA      NA      NA      NA      NA      NA      NA      NA
##  4            NA      NA      NA      NA      NA      NA      NA      NA      NA
##  5            NA      NA      NA      NA      NA      NA      NA      NA      NA
##  6            NA      NA      NA      NA      NA      NA      NA      NA      NA
##  7            NA      NA      NA      NA      NA      NA      NA      NA      NA
##  8            NA      NA      NA      NA      NA      NA      NA      NA      NA
##  9            NA      NA      NA      NA      NA      NA      NA      NA      NA
## 10            NA      NA      NA      NA      NA      NA      NA      NA      NA
## # … with 502,403 more rows, 11 more variables: vegAteFishOnly4 <dbl>,
## #   vegAteMeatAndFish0 <dbl>, vegAteMeatAndFish1 <dbl>,
## #   vegAteMeatAndFish2 <dbl>, vegAteMeatAndFish3 <dbl>,
## #   vegAteMeatAndFish4 <dbl>, vegDidnt0 <dbl>, vegDidnt1 <dbl>,
## #   vegDidnt2 <dbl>, vegDidnt3 <dbl>, vegDidnt4 <dbl>, and abbreviated variable
## #   names ¹​vegAteMeatOnly0, ²​vegAteMeatOnly1, ³​vegAteMeatOnly2,
## #   ⁴​vegAteMeatOnly3, ⁵​vegAteMeatOnly4, ⁶​vegAteFishOnly0, ⁷​vegAteFishOnly1, …
ate.plot<-ate%>% 
    gather(v, value,vegAteMeatOnly0:vegDidnt4, na.rm = T)%>%
    group_by(v)%>%count(value)
    
as_tibble(ate.plot)
## # A tibble: 20 × 3
##    v                  value     n
##    <chr>              <dbl> <int>
##  1 vegAteFishOnly0        1   214
##  2 vegAteFishOnly1        1   312
##  3 vegAteFishOnly2        1   236
##  4 vegAteFishOnly3        1   255
##  5 vegAteFishOnly4        1   269
##  6 vegAteMeatAndFish0     1    19
##  7 vegAteMeatAndFish1     1    20
##  8 vegAteMeatAndFish2     1    12
##  9 vegAteMeatAndFish3     1    14
## 10 vegAteMeatAndFish4     1    19
## 11 vegAteMeatOnly0        1    76
## 12 vegAteMeatOnly1        1    75
## 13 vegAteMeatOnly2        1    71
## 14 vegAteMeatOnly3        1   100
## 15 vegAteMeatOnly4        1    86
## 16 vegDidnt0              1  2206
## 17 vegDidnt1              1  3578
## 18 vegDidnt2              1  2922
## 19 vegDidnt3              1  3681
## 20 vegDidnt4              1  3509
ate.plot=ate.plot%>%separate(v, into=c("group", "TP"), sep= -1, convert=T)%>%select(-value)
as.data.frame(ate.plot)
##                group TP    n
## 1     vegAteFishOnly  0  214
## 2     vegAteFishOnly  1  312
## 3     vegAteFishOnly  2  236
## 4     vegAteFishOnly  3  255
## 5     vegAteFishOnly  4  269
## 6  vegAteMeatAndFish  0   19
## 7  vegAteMeatAndFish  1   20
## 8  vegAteMeatAndFish  2   12
## 9  vegAteMeatAndFish  3   14
## 10 vegAteMeatAndFish  4   19
## 11    vegAteMeatOnly  0   76
## 12    vegAteMeatOnly  1   75
## 13    vegAteMeatOnly  2   71
## 14    vegAteMeatOnly  3  100
## 15    vegAteMeatOnly  4   86
## 16          vegDidnt  0 2206
## 17          vegDidnt  1 3578
## 18          vegDidnt  2 2922
## 19          vegDidnt  3 3681
## 20          vegDidnt  4 3509
ate.plot$group=mapvalues(ate.plot$group,
                         from=unique(ate.plot$group),
                         to=c("Ate fish", "Ate meat and fish",
                              "Ate meat", "Did not eat meat or fish"))

prop=0
for (i in 0:4){
    y=ate.plot[ate.plot$TP==i,]
    prop[i+1]=sum(y$n[1:3])/y$n[4]
    prop[i+1]=round(prop[i+1]*100, digits = 2)
}
 
ate.plot=ate.plot%>%
       mutate_when(group=="Did not eat meat or fish", list(value=prop))

plot=ggplot(data=ate.plot, aes(x=TP, y=n, fill=group)) +
    geom_bar(position="stack", stat="identity", width=0.5)+
    theme_bw()+
    ggtitle("Self-identified 'vegetarian/vegan' who indicated meat or fish eating 
on the same 24HR survey instance")+
    scale_fill_manual(values=c("blue","red", 
                        "green", "grey"))+
    xlab("Instance of 24 hour recall")+
    ylim(0,4200)+
    geom_text(aes(label = ifelse(is.na(value), "",
                    paste(value, "%",sep="")), vjust=-3))+
    labs(fill="Vegetarian/vegan who:")+
    ylab("Number of participants")
plot

#write plot to tiff
# tiff(filename="veg.atemeatorfish.tiff", width=6, height=4, units='in', res=600)
# plot
# dev.off()

Create new vegetarian columns

Veg5yr1 is the vegetarian variable used in the main analysis. Again the criteria are:

  • Self-identified as a vegetarian or vegan the first time they were asked on a 24HR survey

  • Didn’t indicate eating any meat or fish on that same 24HR survey

  • Indicated no major dietary changes over past 5 years

  • No meat or fish eating indicated on initial assessment

new$Veg5yr1<-0
new$Veg5yr1[new$DietChanges5yr==0 & 
             new$IAveg==TRUE & 
             new$nextSpecialVeg==1 &
             new$nextMeat24==0 & 
             new$nextFish24==0]=1
new$Veg5yr1[is.na(new$times_taken_24HR)]=NA

table(new$Veg5yr1) #3207
## 
##      0      1   <NA> 
## 207762   3205 291446

Table: reason self-identified vegetarian was disqualified from study

dq=new%>%select(Veg5yr1, DietChanges5yr, IAveg, nextSpecialVeg, nextMeat24, nextFish24, SpecialTimesVeg)%>%filter(SpecialTimesVeg>=1)
nrow(dq) #people who said they were vegetarian at least once
## [1] 9115
dq$nextMeatFish24=0
dq$nextMeatFish24[dq$nextMeat24==1 | dq$nextFish24==1]=1

dq1=as.data.frame(dq%>%group_by(nextSpecialVeg, nextMeatFish24,IAveg, DietChanges5yr)%>%summarize(n()))
## `summarise()` has grouped output by 'nextSpecialVeg', 'nextMeatFish24', 'IAveg'. You can
## override using the `.groups` argument.
dq1$nextSpecialVeg=mapvalues(dq1$nextSpecialVeg, from=c(0,1),
                             to=c("No", "Yes"))
dq1$IAveg=mapvalues(dq1$IAveg, from=c("TRUE", "FALSE"),
                    to=c("No", "Yes")) #reversing the point of this column from "were they veg" to "did they eat meat"

dq1$nextMeatFish24=mapvalues(dq1$nextMeatFish24, from=c(0,1),
                         to=c("No", "Yes"))

dq1$DietChanges5yr=mapvalues(dq1$DietChanges5yr, from=c(0,1),
                         to=c("No", "Yes"))

colnames(dq1)=c("Veg. on first 24HR taken",
                "Ate meat/fish on first 24HR taken",
                "Ate meat/fish on initial assessment",
                "Major dietary changes past 5 years",
                "N")

novegfirst=unlist(dq1%>%filter(`Veg. on first 24HR taken`=="No")%>%summarize(sum(N)))

#https://haozhu233.github.io/kableExtra/awesome_table_in_html.html

dq1%>%filter(`Veg. on first 24HR taken`=="Yes")%>%
    add_row(`Veg. on first 24HR taken`="No", 
            `Ate meat/fish on first 24HR taken`="", 
            `Ate meat/fish on initial assessment`="", 
            `Major dietary changes past 5 years`="",
            N=novegfirst)%>%
    arrange(desc(`Veg. on first 24HR taken`),
            `Ate meat/fish on initial assessment`)%>%
    kbl(align="c", caption=paste("Survey results for", nrow(dq),"participants who self-identified as vegetarians at least once", sep=" ")) %>%
  kable_classic_2(full_width = F)%>%
    row_spec(1, background="#36E072")
Survey results for 9115 participants who self-identified as vegetarians at least once
Veg. on first 24HR taken Ate meat/fish on first 24HR taken Ate meat/fish on initial assessment Major dietary changes past 5 years N
Yes No No No 3205
Yes No No Yes 1136
Yes Yes No No 11
Yes Yes No Yes 11
Yes No Yes No 1628
Yes No Yes Yes 932
Yes Yes Yes No 490
Yes Yes Yes Yes 369
Yes No NA NA 6
No 1327

Write output files.

# Write output ------------------------------------------
write.csv(new, "DefineVeg-09062022.csv", row.names=F, quote=F)
vegonly=new%>%filter(SpecialPropVeg>0)
write.csv(vegonly, "Vegonly-09062022.csv", row.names=F, quote=F)

  1. This seemed unnecessarily complicated, so I verified with UKB and they confirmed:

    “If a day is recorded in field 20080-x.0, but a null value is given for field 20086-x.0, then it means that the participant did not follow any of the six possible special diets during that period. Each diet that they did follow is recorded sequentially in fields 20086-x.0 through 20086-x.5 (i.e. the last is only populated if all diets were selected). No entry in field 20080-x.0 indicates the participant did not complete instance x of the dietary questionnaire (and all of 20086-x.0 through 20086-x.5 will be empty).”

    –Sean Watson, UK Biobank data analyst, 8/31/2022↩︎