Cleaning the genotype data for the Attie project ================================================ Karl W Broman <kbroman@biostat.wisc.edu> 25 May 2011 :toc2: :numbered: :data-uri: //////////////////////////////////////////// Genotype data: Data/genotypes4rqtl.csv Map: Maps/finalmap.csv Raw genotype information: Data/detailed_genotypes.csv //////////////////////////////////////////// == Load the data == First we load the R/qtl package and print the version we're using. <<loadlib>>= library(qtl) qtlversion() @ <<loadlibrary,echo=FALSE,results=hide>>= setCacheDir("Rcache") options(width=132, digits=3, scipen=4) set.seed(62896949) @ We load the raw genotype data, which includes data on the parental strains and F~1~. We drop the control mouse with a blank for ``strain.'' <<loaddata,cache=TRUE>>= rawg <- read.cross("csv", "Data", "genotypes4rqtl.csv", genotypes=0:2, convertXdata=FALSE, alleles=c("B","R")) rawg <- subset(rawg, ind=(rawg$pheno$Strain!="")) @ The mouse with missing strain also had missing sex, we clean up the sex and strain phenotypes as follows: <<fixsex>>= rawg$pheno$Sex <- factor(as.character(rawg$pheno$Sex), levels=c("Female", "Male")) rawg$pheno$Strain <- factor(as.character(rawg$pheno$Strain), levels=unique(as.character(rawg$pheno$Strain))) @ A quick summary: <<summaryrawg>>= summary(rawg) @ Note that the chromosome +"un"+ is a group of \Sexpr{nmar(rawg)["un"]} markers that I couldn't find in the mouse genome build 37. == Missing data pattern == We first look at the number of missing genotypes at each mouse and each marker. <<histnmissing, fig=TRUE, height=10>>= par(mfrow=c(2,1), las=1) hist(nmissing(rawg), main="No. missing genotypes by mouse", xlab="No. missing genotypes", breaks=140) rug(nmissing(rawg), col="blue") hist(nmissing(rawg, what="mar"), main="No. missing genotypes by marker", xlab="No. missing genotypes", breaks=50) rug(nmissing(rawg, what="mar"), col="blue") @ There are three mice with no genotype data. <<nodata>>= rawg$pheno[ntyped(rawg)==0,] @ I might as well drop these. <<dropnodata>>= rawg <- subset(rawg, ind=(ntyped(rawg)>0)) @ We should look at the histogram of the number of missing genotypes per mouse again. <<histnmissingagain, fig=TRUE>>= par(las=1) hist(nmissing(rawg), main="No. genotypes by mouse", xlab="No. markers genotyped", breaks=140) rug(nmissing(rawg), col="blue") @ There remain a number of mice with a good amount of missing data, which might suggest poor-quality DNAs or other problems with the genotyping arrays. There are \Sexpr{sum(nmissing(rawg) > 0.05*totmar(rawg))} mice missing genotypes at more than 5% of the markers (that is, at more than \Sexpr{floor(0.05*totmar(rawg))} of the \Sexpr{totmar(rawg)} markers). There are also a number of markers with elevated rates of missing data. There are \Sexpr{sum(nmissing(rawg, "mar") > nind(rawg)*0.05)} markers with more than 5% missing data (that is, at more than \Sexpr{floor(0.05*nind(rawg))} of the \Sexpr{nind(rawg)} mice). But none of these seem terrible. == Study X chromosome genotypes == Let's start by looking at the X chromosome genotypes. The cross was (BTBR x B6) x (BTBR x B6), with females listed first, and so the F~1~ males should all be hemizygous B6 or BTBR while the F~1~ females should be heterozygous. The F~2~ males should be hemizygous B6 or BTBR and the F~2~ females should be heterozygous or homozygous BTBR. We pull out the X chromosome genotypes and the ``strain'' and sex for each mouse. <<pullxchrgeno>>= gx <- pull.geno(rawg, chr="X") dim(gx) strain <- rawg$pheno$Strain sex <- rawg$pheno$Sex @ === Not segregating === First, let's look at which markers seem to segregate in the F~2~. We create a table of the three possible genotypes (1, 2, 3) and look at the number of mice with the least-frequent genotype. Since there are \Sexpr{sum(strain=="F2" & sex=="Male")} F~2~ males and \Sexpr{sum(strain=="F2" & sex=="Female")} F~2~ females, all markers should have an appreciable number of mice with each of the three genotypes. <<segregateX>>= ming <- apply(apply(gx[strain=="F2",], 2, function(a) table(factor(a, levels=1:3))), 2, min) table(ming) @ There are \Sexpr{sum(ming < 120)} markers that appear to not be segregating. Let's go ahead and omit those. We'll create a new cross object +cleang+ that will contain the cleaned genotypes. <<drop_notseg_X>>= todrop <- names(ming[ming<120]) cleang <- drop.markers(rawg, todrop) gx <- pull.geno(cleang, chr="X") dim(gx) @ We have just \Sexpr{nmar(cleang)["X"]} markers left on the X chromosome. === Genotypes in B6 and BTBR === Let's first look at the B6 mice. There are \Sexpr{sum(strain=="B6")} such; while \Sexpr{sum(strain=="B6" & sex=="Male")} are male and \Sexpr{sum(strain=="B6" & sex=="Female")} is female, they should all be homozygous for the same allele. <<b6Xchr>>= all(apply(gx[strain=="B6",], 2, function(a) length(unique(a[!is.na(a)]))) == 1) b6xg <- apply(gx[strain=="B6",], 2, function(a) unique(a[!is.na(a)])) @ All of the B6 mice are homozygous for the same allele, on the X chromosome. We repeat this for the \Sexpr{sum(strain=="BTBR")} BTBR mice, of which \Sexpr{sum(strain=="BTBR" & sex=="Male")} are male and \Sexpr{sum(strain=="BTBR" & sex=="Female")} are female. <<btbrXchr>>= all(apply(gx[strain=="BTBR",], 2, function(a) length(unique(a[!is.na(a)])) == 1)) btbrxg <- apply(gx[strain=="BTBR",], 2, function(a) unique(a[!is.na(a)])) @ All of the BTBR mice are homozygous for the same allele, on the X chromosome. We further check that the B6 and BTBR genotypes are really _homozygous_ and for different alleles at these \Sexpr{nmar(cleang)["X"]} markers. <<b6notbtbrX>>= all((b6xg == 1 & btbrxg==3) | (b6xg == 3 & btbrxg==1)) @ Let's swap the genotype codes for the markers where the B6 mice have the +3+ genotype (to make it so that +1+ is homozygous B6 and +3+ is homozygous BTBR). <<swapXgenotypes>>= toswap <- names(b6xg)[b6xg==3] for(mar in toswap) cleang$geno[["X"]]$data[,mar] <- 4 - cleang$geno[["X"]]$data[,mar] gx <- pull.geno(cleang, chr="X") @ === Genotypes in F~1~ === Let's look at the \Sexpr{sum(strain=="F1")} F~1~ mice, of which \Sexpr{sum(strain=="F1" & sex=="Male")} are male and \Sexpr{sum(strain=="F1" & sex=="Female")} are female. The males should all by homozygous BTBR and the females should all by heterozygous. <<checkF1X>>= f1male.gx <- gx[strain=="F1" & sex=="Male",] f1female.gx <- gx[strain=="F1" & sex=="Female",] all(is.na(f1male.gx) | f1male.gx==3) all(is.na(f1female.gx) | f1female.gx==2) @ Everything looks good. === Genotypes in F~2~ males === We now turn to the F~2~ males. They should all be hemizgous B6 (+1+) or BTBR (+3+). <<checkF2maleX>>= f2male.gx <- gx[strain=="F2" & sex=="Male",] sum(!is.na(f2male.gx) & f2male.gx==2) @ There are \Sexpr{sum(!is.na(f2male.gx) & f2male.gx==2)} heterozygous genotypes in the \Sexpr{sum(strain=="F2" & sex=="Male")} male mice. Let's look at the males that have heterozygous calls. <<F2maleXhet>>= wh <- apply(f2male.gx, 1, function(a) any(!is.na(a) & a==2)) sum(wh) tab <- apply(f2male.gx[wh,], 1, function(a) table(factor(a[!is.na(a)], levels=1:3))) tab[,order(tab[2,])] @ The first \Sexpr{sum(tab[2,]==1)} of these look to be male but with a single genotyping error. The other \Sexpr{sum(tab[2,]>1)} look to be really female. Let's save the IDs for these mice. <<notmale>>= malef2id <- as.character(cleang$pheno$MouseNum)[strain=="F2" & sex=="Male"] notmale <- malef2id[apply(f2male.gx, 1, function(a) sum(!is.na(a) & a==2)>1)] @ Note that there are additional male mice that might really be females; if all of their genotypes on the X chromosome are homozygous/hemizygous BTBR, we can't tell. <<notsuremale>>= sum(apply(f2male.gx, 1, function(a) all(is.na(a) | a==3))) @ === Genotypes in F~2~ females === We turn to the F~2~ females. They should all be heterozygous (+2+) or homozygous BTBR (+3+). <<checkF2femaleX>>= f2female.gx <- gx[strain=="F2" & sex=="Female",] sum(!is.na(f2female.gx) & f2female.gx==1) @ There are \Sexpr{sum(!is.na(f2female.gx) & f2female.gx==1)} homozygous B6 genotypes in the \Sexpr{sum(strain=="F2" & sex=="Female")} female mice. Let's look at the females that have homozygous B6 calls. <<F2femaleXhet>>= wh <- apply(f2female.gx, 1, function(a) any(!is.na(a) & a==1)) sum(wh) tab <- apply(f2female.gx[wh,], 1, function(a) table(factor(a[!is.na(a)], levels=1:3))) tab[,order(tab[1,])] @ These all look to be clearly males. Let's save the IDs for these mice. <<notfemale>>= femalef2id <- as.character(cleang$pheno$MouseNum)[strain=="F2" & sex=="Female"] notfemale <- femalef2id[apply(f2female.gx, 1, function(a) sum(!is.na(a) & a==1)>1)] @ <<savethesexswaps,echo=FALSE>>= save(notmale, notfemale, file="Data/sex_swaps.RData") @ Note that there are additional female mice that might really be males; if all of their genotypes on the X chromosome are homozygous/hemizygous BTBR, we can't tell. <<notsurefemale>>= sum(apply(f2female.gx, 1, function(a) all(is.na(a) | a==3))) @ === Swap sexes === Let's go ahead and swap the sexes of these mice whose X chromosome genotypes indicate they are the opposite sex. We'll change the phenotype +Sex+ to +SexID+ and make a new ``phenotype'' +Sex+. <<swapsex>>= names(cleang$pheno)[names(cleang$pheno)=="Sex"] <- "SexID" cleang$pheno$Sex <- cleang$pheno$SexID cleang$pheno$Sex[match(notmale, cleang$pheno$MouseNum)] <- "Female" cleang$pheno$Sex[match(notfemale, cleang$pheno$MouseNum)] <- "Male" table(cleang$pheno$SexID, cleang$pheno$Sex) @ I'll re-create my +sex+ vector to match the new values. <<make_sex>>= sex <- cleang$pheno$Sex @ === Re-code genotypes === Now let's recode the X chromosome genotypes. F~2~ females should have genotype codes +1+ for homozygous BTBR and +2+ for heterozygous. F~2~ males should have genotype codes +1+ for homozygous B6 and +2+ for homozygous BTBR. First, we handle the males. <<recode_male_Xgenotypes>>= gx <- pull.geno(cleang, chr="X") whmale <- strain=="F2" & sex=="Male" gxmale <- gx[whmale,] sum(!is.na(gxmale) & gxmale==2) gxmale[!is.na(gxmale) & gxmale==2] <- NA gxmale[!is.na(gxmale) & gxmale==3] <- 2 gx[whmale,] <- gxmale @ Now, we handle the females. <<recode_female_Xgenotypes>>= whfemale <- strain=="F2" & sex=="Female" gxfemale <- gx[whfemale,] sum(!is.na(gxfemale) & gxfemale==1) gxfemale[!is.na(gxfemale) & gxfemale==1] <- NA gxfemale[!is.na(gxfemale) & gxfemale==3] <- 1 gx[whfemale,] <- gxfemale @ Now we paste the genotypes back into the cross object. <<paste_genotypes>>= cleang$geno[["X"]]$data <- gx @ Let's further add a +pgm+ phenotype, indicating cross direction, which will be 1 for all F~2~ mice. <<addpgm>>= cleang$pheno$pgm <- rep(NA, nind(cleang)) cleang$pheno$pgm[strain=="F2"] <- 1 @ The summary of the cross (with just the F~2~ mice) should be clean. (But then, the warnings don't get printed here anyway!) <<summaryagain>>= summary(subset(cleang, ind=(strain=="F2"))) @ === Segregation patterns === Let's look at the segregation of the genotypes on the X chromosome. First, let's look at the output of geno.table. What we're looking for here are departures from 1:1 segregation that might indicate markers with high rates of genotyping errors. <<Xsegregation>>= geno.table(subset(cleang, ind=(strain=="F2")), chr="X") @ Everything looks okay. Let's make a couple of plots, of genotype frequencies along the X chromosome for each sex. <<plotXsegregation, fig=TRUE, height=8>>= par(mfrow=c(2,1), las=1) f2male <- subset(cleang, ind=(strain=="F2" & sex=="Male")) plot(geno.table(f2male, chr="X", scanone=TRUE), lod=8:9, main="X chr genotype frequencies; F2 males", ylab="Genotype frequency", col=c("blue","red"), ylim=c(0.3, 0.7)) abline(h=0.5, lty=2, col="gray") legend("topleft", lwd=2, col=c("blue","red"), legend=c("hem B6", "hem BTBR")) f2female <- subset(cleang, ind=(strain=="F2" & sex=="Female")) plot(geno.table(f2female, chr="X", scanone=TRUE), lod=4:5, main="X chr genotype frequencies; F2 females", ylab="Genotype frequency", col=c("green", "red"), ylim=c(0.3, 0.7)) abline(h=0.5, lty=2, col="gray") legend("topleft", lwd=2, col=c("green","red"), legend=c("het", "hom BTBR")) @ The departures from 1:1 segregation seem not too worrisome. === Counts of crossovers === Let's look at the number of crossovers in the F~2~ mice on the X chromosome. <<countXO_onX>>= nxo <- countXO(subset(cleang, ind=(strain=="F2")), chr="X") table(nxo) @ There are \Sexpr{sum(nxo>2)} mice with >2 crossovers. Let's plot the genotype data for these mice. We run +calc.errorlod+ to avoid a warning message. (The genotyping error LOD scores are used in +plot.geno+ to flag potential genotyping errors.) <<plotXgeno, fig=TRUE>>= temp <- subset(cleang, ind=(strain=="F2")) temp <- subset(temp, ind=(nxo>2), chr="X") temp <- calc.errorlod(temp, error.prob=0.002, map.function="c-f") plot.geno(temp, chr="X") @ These look unusual, but there are no obvious problems. === Re-estimate map === Finally, let's re-estimate the genetic map for the X chromosome and see how it differs from the map from Cox et al. (_Genetics,_ 182:1335-1344, 2009). I'll use a genotype error rate of 2/1000, and the Carter-Falconer map function (which corresponds, approximately, to the high level of crossover interference in the mouse). <<estmap>>= f2g <- subset(cleang, ind=(strain=="F2"), chr="X") newmap <- est.map(f2g, err=0.002, map.function="c-f") @ Let's look at summaries of the original and the newly estimated maps. <<summarymap>>= summary.map(f2g)[1,,drop=FALSE] summary.map(newmap)[1,,drop=FALSE] @ Overall, the map hasn't changed much. Let's plot the map next to the one embedded in the cross (which corresponds to that of Cox et al. 2009). The map on the bottom is the newly estimated one <<plotmap, fig=TRUE>>= plot.map(f2g, newmap, horizontal=TRUE) @ It sure looks similar. It might be better to study the individual interval lengths. <<plotintervals, fig=TRUE>>= dom <- diff(pull.map(f2g)[[1]]) dnm <- diff(newmap[[1]]) par(las=1) plot(dom, dnm-dom, type="n", xlab="Original interval length (cM)", ylab="Change in interval length (cM)") abline(h=0, lty=2, col="gray") text(dom, dnm-dom, seq(along=dnm)) @ Note that the numbers indicate the intervals. Some of the smaller intervals have changed a great deal in length, but overall it looks pretty good. == Study autosomal genotypes == We now turn to the autosomal genotypes. <<pullautosomalgeno>>= ga <- pull.geno(cleang, chr="-X") dim(ga) @ === Not segregating === First, let's look at which markers seem to segregate in the F~2~. We create a table of the three possible genotypes (1, 2, 3) and look at the proportion of mice with the least-frequent genotype. Since there are \Sexpr{ncol(ga)} markers, we'll look at a histogram. <<segregateA,fig=TRUE,cache=TRUE>>= ming <- apply(apply(ga[strain=="F2",], 2, function(a) table(factor(a, levels=1:3))/sum(!is.na(a))), 2, min) par(las=1) hist(ming, breaks=seq(0, ceiling(max(ming)*100)/100, by=0.005), xlab="Frequency of least-frequent genotype", main="") rug(ming, col="blue") @ Note the big gap between \Sexpr{round(max(ming[ming< 0.1])*100)}% and \Sexpr{round(min(ming[ming>0.1])*100)}%. There are \Sexpr{sum(ming < 0.1)} markers that appear not to be segregating, as the least-frequent genotype is present in <10% of mice. There are \Sexpr{sum(ming > 0.1)} markers that do appear to be segregating, as the least-frequent genotype is present in >10% of mice. Let's go ahead and omit the non-segregating markers. <<drop_notseg_A>>= todrop <- names(ming[ming<0.1]) cleang <- drop.markers(cleang, todrop) ga <- pull.geno(cleang, chr="-X") dim(ga) @ === Genotypes in B6 and BTBR === Let's look at the B6 mice. There are \Sexpr{sum(strain=="B6")} such; they should all be homozygous for the same allele. <<b6Achr>>= bad.b6 <- apply(ga[strain=="B6",], 2, function(a) (length(unique(a[!is.na(a)]))) != 1 || all(is.na(a) | a==2)) sum(bad.b6) ga[strain=="B6",bad.b6] @ There are \Sexpr{sum(bad.b6)} markers with heterozygous calls, including one with no homozygous calls. Let's grab the inferred genotype (+1+, +3+, or +NA+) for B6 mice. <<b6Agenotype>>= b6ga <- apply(ga[strain=="B6",], 2, function(a) if(all(is.na(a) | a==2)) return(NA) else return(unique(a[!is.na(a) & a!=2]))) sum(is.na(b6ga)) table(b6ga) @ We repeat this for the \Sexpr{sum(strain=="BTBR")} BTBR mice. <<btbrAchr>>= bad.btbr <- apply(ga[strain=="BTBR",], 2, function(a) (length(unique(a[!is.na(a)]))) != 1 || all(is.na(a) | a==2)) sum(bad.btbr) ga[strain=="BTBR",bad.btbr] @ There are \Sexpr{sum(bad.btbr)} markers with heterozygous calls. Again, we pull out the inferred genotype for BTBR mice. <<btbrAgenotype>>= btbrga <- apply(ga[strain=="BTBR",], 2, function(a) if(all(is.na(a) | a==2)) return(NA) else return(unique(a[!is.na(a) & a!=2]))) sum(is.na(btbrga)) table(btbrga) @ We further check that the B6 and BTBR genotypes are homozygous for different alleles at these \Sexpr{sum(nmar(cleang)[-20])} markers. <<b6notbtbrA>>= all(is.na(b6ga) | (b6ga==1 & btbrga==3) | (b6ga==3 & btbrga==1)) @ Let's swap the genotype codes for the markers where the BTBR mice have the +1+ genotype (to make it so that +1+ is homozygous B6 and +3+ is homozygous BTBR). <<swapAgenotypes>>= toswap <- names(btbrga)[btbrga==1] for(chr in names(cleang$geno)[-20]) { thisswap <- toswap[!is.na(match(toswap, markernames(cleang, chr=chr)))] cleang$geno[[chr]]$data[,thisswap] <- 4 - cleang$geno[[chr]]$data[,thisswap] } ga <- pull.geno(cleang, chr="-X") @ === Genotypes in F~1~ === Let's look at the \Sexpr{sum(strain=="F1")} F~1~ mice. They should all be heterozygous. <<checkF1A>>= f1ga <- ga[strain=="F1",] sum(!is.na(f1ga) & f1ga != 2) f1ga[,apply(f1ga, 2, function(a) !all(is.na(a) | a==2))] @ There are 18 markers with a homozygous F~1~ mouse, but at each of these markers there is just one such mouse. === Segregation in F~2~ === We now turn to the segregation patterns of these autosomal markers in the F~2~ mice. As in our study of the X-linked markers, we are looking for markers whose aberrant segregation indicates a high genotyping error rate. We use +geno.table+ to inspect the segregation patterns; the p-values are for tests of 1:2:1 segregation. We do the calculation for all markers, and then pull out the rows that show significant departure at a 5% level, after a Bonferroni correction for the multiple tests. <<genotype_table>>= gt <- geno.table(subset(cleang, chr="-X", ind=(strain=="F2"))) bad <- gt$P.value < 0.05/nrow(gt) gt[bad,] @ The markers on chromosome 6 show real segregation distortion rather than genotyping errors, due to the study design (that's where leptin sits). The other \Sexpr{sum(bad & gt$chr!=6)} markers are probably error-prone. (The two markers on chromosome 4 are not near each other.) It's probably best to remove them. <<removebadmarkers>>= cleang <- drop.markers(cleang, rownames(gt)[bad & gt$chr != 6]) @ Let's look at plots of the segregation patterns. First, we re-run +geno.table+ with the argument +scanone=TRUE+ (more formally, +scanone.output=TRUE+), so that the output is as that from a genome scan. <<rerun_genotype_table,cache=TRUE>>= gt <- geno.table(subset(cleang, chr="-X", ind=(strain=="F2")), scanone=TRUE) @ Now let's plot the -log (base 10) p-values and the genotype frequencies, for each chromosome. For the p-values, I'll use a different y-axis for chromosome 6 than for the others. [[plot_segregationA_top]] This is a _very long_ set of figures; <<plot_segregationA_bottom,click here>> to jump to below them. <<junk,echo=FALSE>>= Somehow, this avoids the "figure margins too large" error par(mar=rep(0,4)) @ <<plotsegregationA,fig=TRUE,height=64>>= par(mfrow=c(20,2)) ym1a <- max(gt$neglog10P) ym1b <- max(gt$neglog10P[gt$chr != 6]) ym2 <- max(gt$BR) for(i in c(1:19,"un")){ plot(gt, chr=i, main=paste("Chr", i), ylim=c(0,ifelse(i==6,ym1a,ym1b))) plot(gt, chr=i, lod=3:5, main=paste("Chr", i), ylab="Genotype frequency", ylim=c(0, ym2), col=c("green","blue","red")) legend("bottomleft", lwd=2, col=c("green","blue","red"), c("BB","BR","RR")) } @ [[plot_segregationA_bottom]] (<<plot_segregationA_top,Click here>> to jump back to the top of those figures.) There are lots of little spikes in these figures (e.g., chromosome 13 at \Sexpr{myround(gt$pos[gt$chr==13 & gt[,3]>4], 1)} cM): single markers with somewhat aberrant segregation patterns relative to the surrounding markers. These likely correspond to a small proportion of genotyping errors. The bad marker on chromosome 13 is quite clearly indicated to be due to a large batch of double-crossovers (indicating likely genotyping errors). To show that, we look at two-locus genotype tables for the problem marker and the adjacent markers to the left and right. <<chr13issue>>= f2g <- subset(cleang, ind=(strain=="F2")) themar <- rownames(gt)[gt$chr==13 & gt[,3]>4] c13mar <- markernames(cleang, chr=13) themarnum <- which(c13mar == themar) c13mar[themarnum+c(-1,0,1)] geno.crosstab(f2g, c13mar[themarnum+c(-1,1)]) geno.crosstab(f2g, c13mar[themarnum+c(-1,0)]) geno.crosstab(f2g, c13mar[themarnum+c(0,1)]) @ As seen in the above tables, the markers to the left and right of \Sexpr{themar} show just one recombinant, but there are about 30 double-recombination events. This marker appears to have a genotyping error rate of about 5%. It is interesting to note that it also has 5% missing data. Probably I should look at the quality scores for this and other markers, and perhaps also the intensities from the genotyping arrays. For now, it is probably okay to leave this and similar markers in the data, as if we run +calc.genoprob+ with +error.prob=0.002+, the errors will be smoothed over. We can check that, for this marker, by looking at the two-locus genotype tables after imputing in the genotypes with +fill.geno+. <<chr13imputed>>= f2gi <- fill.geno(f2g, error.prob=0.002, map.function="c-f", method="argmax") geno.crosstab(f2gi, c13mar[themarnum+c(-1,1)]) geno.crosstab(f2gi, c13mar[themarnum+c(-1,0)]) geno.crosstab(f2gi, c13mar[themarnum+c(0,1)]) @ As seen in these tables, in the imputed data, the genotypes that led to double-crossovers were deemed to be errors. === Linkage to other chromosomes === We next study whether the markers are all linked to their respective chromosomes and not to other chromosomes. We do so by first considering each pair of markers and calculating the estimated recombination fractions and LOD scores indicating evidence for linkage. This is accomplished with +est.rf+. The calculation can take quite a long time, since with have a total of \Sexpr{totmar(subset(cleang, chr="-X"))} autosomal markers (including the \Sexpr{nmar(cleang)["un"]} markers on chromosome +"un"+). We use +pull.rf+ to pull out matrices of the LOD scores and estimated recombination fractions. <<estrf,cache=TRUE>>= temp <- est.rf(f2g) lod <- pull.rf(temp, "lod") rf <- pull.rf(temp, "rf") @ We now calculate the maximum LOD score between a marker another on its chromosome, and then the maximum LOD score between a marker and another on some other chromosome. <<getmaxlod>>= start <- cumsum(c(1, nmar(f2g))) end <- cumsum(nmar(f2g)) maxlodsame <- maxloddiff <- rep(NA, sum(nmar(f2g))) names(maxlodsame) <- names(maxloddiff) <- markernames(f2g) for(i in seq(along=end)) { maxlodsame[start[i]:end[i]] <- apply(lod[start[i]:end[i],start[i]:end[i]],1,max, na.rm=TRUE) maxloddiff[start[i]:end[i]] <- apply(lod[start[i]:end[i],-c(start[i]:end[i],start[21]:end[21])],1,max, na.rm=TRUE) } max(maxloddiff[-(start[21]:end[21])]) @ For unlinked marker pairs (ignoring the markers on chromosome +"un"+), the biggest LOD score is \Sexpr{myround(max(maxloddiff[-(start[21]:end[21])]), 2)}. So everything looks clean. The three markers on the +"un"+ chromosome map quite cleanly to chromosomes 7 and 8, but since we don't have reliable build 37 positions for them, it's probably best to just ignore them. Let me clarify that a bit. Let's find, for each of the markers on chromosome +"un"+, the marker with the strongest evidence for linkage. <<linkedtoun>>= unmar <- markernames(f2g, chr="un") linkedmar <- unmar for(i in 1:3) linkedmar[i] <- colnames(lod)[!is.na(lod[,end[20]+i]) & lod[,end[20]+i]==max(lod[,end[20]+i], na.rm=TRUE)] find.markerpos(f2g, linkedmar) @ Now let's plot the LOD score for each of those markers against each other marker in the genome. <<plotlodun,fig=TRUE,height=9>>= unmar <- markernames(f2g, chr="un") par(mfrow=c(3,1)) for(i in unmar) qtl:::plot.rfmatrix(lod, i, chr="-un", ylim=c(0,260), main=i) @ === Counts of crossovers === Let's look again at the observed number of crossovers in each individual. We'll first consider just the autosomes (and ignore the +"un"+ chromosome). These will be greatly affected by apparent genotyping errors, and so we'll do this both with the ``raw'' data as well as with imputed genotypes. <<countXO_autosomes>>= nxo.raw <- countXO(f2g, chr=c("-X", "-un")) nxo.clean <- countXO(fill.geno(f2g, method="argmax", error.prob=0.002, map.function="c-f"), chr=c("-X", "-un")) sort(nxo.clean, decreasing=TRUE)[1:10] @ There are a handful of individuals with a large number of crossovers. Let's look at scatter plots of the raw counts against those from the imputed data. <<plotcountXO,fig=TRUE>>= par(mfrow=c(1,2), las=1) plot(nxo.clean, nxo.raw, xlab="No. crossovers (imputed)", ylab="No. crossovers (raw data)") plot(nxo.clean, nxo.raw, xlab="No. crossovers (imputed)", ylab="No. crossovers (raw data)", xlim=c(0,60), ylim=c(0,340)) @ === Apparent genotyping errors === Let's look more closely at the apparent genotyping errors. We'll look at all of the markers except those in the +"un"+ group, and we'll compare the observed genotypes to the inferred genotypes allowing a small amount of genotyping error (using +fill.geno+ with +method="argmax"+). <<fill_genotypes>>= gi <- pull.geno(f2gi, chr="-un") g <- pull.geno(f2g, chr="-un") rownames(g) <- rownames(gi) <- as.character(f2g$pheno$MouseNum) @ Now let's count the number of apparent errors (that is, where the observed genotype doesn't match the imputed genotype) by marker and by mouse. <<count_errors>>= err.by.mar <- apply((!is.na(g) & gi != g), 2, function(a) sum(a)) err.by.mar.prop <- err.by.mar/nind(f2g) err.by.ind <- apply((!is.na(g) & gi != g), 1, function(a) sum(a)) tot.mar <- totmar(subset(f2g, chr="-un")) err.by.ind.prop <- err.by.ind/tot.mar @ Now let's plot histograms of the proportion of errors, by marker and by mouse. <<hist_prop_errors, fig=TRUE, height=8>>= par(las=1, mfrow=c(2,1)) hist(err.by.mar.prop, breaks=seq(-0.25, max(err.by.mar)+ 0.25, by=0.5)/nind(f2g), main="Errors by marker", xlab="Proportion of apparent genotyping errors") rug(err.by.mar.prop, col="blue") u <- par("usr") text(mean(u[1:2]), mean(u[3:4]), paste(sum(err.by.mar.prop > 0.02), "markers with > 2% errors"), col="red") text(mean(u[1:2]), mean(u[3:4])*0.7, paste(sum(err.by.mar.prop < 0.01), "markers with < 1% errors"), col="blue") hist(err.by.ind.prop, breaks=seq(-1, max(err.by.ind)+ 1, by=2)/tot.mar, main="Errors by mouse", xlab="Proportion of apparent genotyping errors") rug(err.by.ind.prop, col="blue") u <- par("usr") text(mean(u[1:2]), mean(u[3:4]), paste(sum(err.by.ind.prop > 0.05), "mice with > 5% errors"), col="red") @ Let's now look at the proportion of genotyping errors by marker, after we omit the \Sexpr{sum(err.by.ind.prop > 0.05)} mice with error rates > 5%. <<error_by_mar_again,fig=TRUE>>= err.by.mar.dropbadmice <- apply((!is.na(g) & gi != g)[err.by.ind.prop < 0.05,], 2, function(a) sum(a)) err.by.mar.dropbadmice.prop <- err.by.mar.dropbadmice/sum(err.by.ind.prop < 0.05) par(las=1) hist(err.by.mar.dropbadmice.prop, breaks=seq(-0.25, max(err.by.mar.dropbadmice)+ 0.25, by=0.5)/sum(err.by.ind.prop < 0.05), main="Errors by marker, dropping bad mice", xlab="Proportion of apparent genotyping errors") rug(err.by.mar.dropbadmice.prop, col="blue") u <- par("usr") text(mean(u[1:2]), mean(u[3:4]), paste(sum(err.by.mar.dropbadmice.prop > 0.02), "markers with > 2% errors"), col="red") text(mean(u[1:2]), mean(u[3:4])*0.7, paste(sum(err.by.mar.dropbadmice.prop < 0.01), "markers with < 1% errors"), col="blue") @ == Study the raw genotype array information == In the last section above, we saw that there are a number of mice with very high error rates, and also a number of markers with very high error rates. In order to figure out what's going on with these markers and mice, it seemed best to look at the raw genotype information. The big genotype data file from Rosetta includes the allele intensities for each mouse at each marker on the Affymetrix genotyping arrays, as well as a quality score (which itself is of dubious quality). Via a Perl script, I pulled out just the \Sexpr{totmar(cleang)} segregating markers for all mice (except the control mouse). At each marker, I grabbed the intensities for the two alleles, and also all of the other quality-related columns. The resulting file is still quite large (about 160 Mb) and takes a while to load (about 2 min). After loading the file, we'll split it into separate data frames, one for each marker. <<load_detailed_genotypes,cache=TRUE>>= allgeno <- read.csv("Data/detailed_genotypes.csv", as.is=TRUE) allgeno <- split(allgeno, allgeno$snp) @ Let's now sort the markers and mice by their position in +cleang+. <<sort_allgeno>>= allgeno <- allgeno[markernames(cleang)] allgeno <- lapply(allgeno, function(a,b) a[match(b,a$Sample.Name),], cleang$pheno$longid) @ Since these data include not just the F~2~ mice but also the F~1~, B6 and BTBR mice, let's get imputed genotypes for all mice. <<impute_allmice>>= cleangi <- fill.geno(cleang, err=0.002, map.function="c-f", method="argmax") cg <- pull.geno(cleang, chr="-un") cgi <- pull.geno(cleangi, chr="-un") @ === The bad marker on chromosome 13 === Let's first look at that badly behaved marker on chromosome 13, \Sexpr{themar}, and the adjacent markers to the left and right. <<intensities_chr13mar,fig=TRUE,height=12>>= mnam <- markernames(cleang) wh <- which(mnam == themar) c13mar <- mnam[wh+c(-1,0,1)] par(mfrow=c(3,1), las=1) for(i in c13mar) { plot(A2.Normalized.Signal ~ A1.Normalized.Signal, data=allgeno[[i]], main=i, xlab="Allele 1 intensity", ylab="Allele 2 intensity", col=c("blue","red","green","","","orange")[allgeno[[i]]$Measured.Genotype+1], lwd=2) legend("topright", pt.lwd=2, pch=c(1,1,1,1,16), col=c("blue","red","green","orange", "hotpink"), legend=c("11","12","22","missing","error")) points(A2.Normalized.Signal ~ A1.Normalized.Signal, data=allgeno[[i]], col="hotpink", pch=16, subset=!is.na(cg[,i]) & cg[,i] != cgi[,i]) } @ It's clear, from this, why marker \Sexpr{themar} is badly behaved. The two adjacent markers show clear separation of the three clusters of genotypes, while for \Sexpr{themar}, the clusters blend together. The inferred genotyping errors, highlighted in pink in the middle figure above, were called homozygous but were probably heterozygotes that melded into one of the clusters of homoyzgotes. === The best and worst markers === Let's look at the same sort of figures for the top five and bottom five markers. <<intensities_best_and_worst,fig=TRUE,height=16>>= worst5 <- names(sort(err.by.mar.dropbadmice, decreasing=TRUE)[1:5]) best5 <- names(sort(err.by.mar.dropbadmice, decreasing=FALSE)[1:5]) par(mfrow=c(5,2), las=1) for(i in 1:5) { plot(A2.Normalized.Signal ~ A1.Normalized.Signal, data=allgeno[[worst5[i]]], main=worst5[i], xlab="Allele 1 intensity", ylab="Allele 2 intensity", col=c("blue","red","green","","","orange")[allgeno[[worst5[i]]]$Measured.Genotype+1], lwd=2) legend("topright", pt.lwd=2, pch=c(1,1,1,1,16), col=c("blue","red","green","orange", "hotpink"), legend=c("11","12","22","missing","error")) points(A2.Normalized.Signal ~ A1.Normalized.Signal, data=allgeno[[worst5[i]]], col="hotpink", pch=16, subset=!is.na(cg[,worst5[i]]) & cg[,worst5[i]] != cgi[,worst5[i]]) plot(A2.Normalized.Signal ~ A1.Normalized.Signal, data=allgeno[[best5[i]]], main=best5[i], xlab="Allele 1 intensity", ylab="Allele 2 intensity", col=c("blue","red","green","","","orange")[allgeno[[best5[i]]]$Measured.Genotype+1], lwd=2) legend("topright", pt.lwd=2, pch=c(1,1,1,1,16), col=c("blue","red","green","orange", "hotpink"), legend=c("11","12","22","missing","error")) points(A2.Normalized.Signal ~ A1.Normalized.Signal, data=allgeno[[best5[i]]], col="hotpink", pch=16, subset=!is.na(cg[,best5[i]]) & cg[,best5[i]] != cgi[,best5[i]]) } @ The figures on the left, above, correspond to the bad markers (as defined by their apparent genotyping error rate); the figures on the right correspond to the good markers. The genotyping errors all seem to derive from the same problem: the intensities for heterozygotes overlap those for the two homozygotes. === Those seven bad mice === Now let's try to figure out what's going on with those seven bad mice. I'll grab the seven bad mice and a random seven of the other ``good'' mice. We'll first look at the +"contrast"+ column, which I take to be a measure distinguishing between the two homozygous clusters. <<badmice>>= badmice <- match(as.character(f2g$pheno$longid)[err.by.ind.prop > 0.05], allgeno[[1]]$Sample.Name) f2g$pheno$MouseNum[badmice] goodmice <- sample(match(as.character(f2g$pheno$longid)[err.by.ind.prop < 0.05], allgeno[[1]]$Sample.Name), length(badmice)) contr <- sapply(allgeno, function(a,b) a$Contrast[b], c(badmice, goodmice)) @ <<badmice_contrast, fig=TRUE, height=17.5>>= par(mfrow=c(7,2), las=1) for(i in 1:7) { hist(contr[i,], breaks=seq(-1.5, 1.5, len=101), xlab="Contrast", main=cleang$pheno$MouseNum[badmice[i]]) hist(contr[i+7,], breaks=seq(-1.5, 1.5, len=101), xlab="Contrast", main=cleang$pheno$MouseNum[goodmice[i]]) } @ It looks like the ``bad'' mice are enriched for apparent homozygous genotypes. Let's look at that more directly by plotting the number of genotyping errors against the proportion of markers with +"contrast"+ < 0.5. <<numerr_v_contr, fig=TRUE, height=6>>= allcontr <- sapply(allgeno, function(a) a$Contrast) plot(apply(allcontr, 1, function(a) mean(abs(a) < 0.5)), apply(!is.na(cg) & (cg != cgi), 1, sum), col=c("black", "blue", "red","green")[as.numeric(strain)], ylab="Number of errors", xlab="Prop contrast < 0.5") legend("topright", col=c("black","blue","red","green"), pch=1, pt.lwd=2, c("F2","B6","BTBR","F1")) @ Six of the seven ``bad'' mice are at the lower range of the proportion of heterozygous genotypes. We can use the a triangle plot (aka Holmans' triangle) to depict the genotype frequencies for each mouse, as another way to indicate this aspect of the problem in these mice. <<triplot, fig=TRUE, height=8>>= library(broman) gf <- apply(pull.geno(cleang, chr=c("-un", "-X")), 1, function(a) table(factor(a, levels=1:3))/sum(!is.na(a))) triplot(labels=c("BB","BR","RR")) tripoints(gf[,-badmice], col=c("black", "blue", "red", "green")[as.numeric(strain)][-badmice], lwd=2) tripoints(gf[,badmice], col="orange", lwd=2) legend("topright", col=c("black","blue","red","green","orange"), pch=1, pt.lwd=2, c("F2","B6","BTBR","F1","bad")) @ == How to deal with the genotyping errors? == I'm inclined to drop the seven mice with high genotyping error rates and so high numbers of observed crossovers. We might further seek to omit particularly bad markers, and even omit particular genotypes that look to be errors, but I think for the markers and the individual genotyping errors, it is sufficient to rely on the hidden Markov model (HMM) with assumed genotyping error rate. So, let's just drop the seven bad mice (having genotyping error rate > 5%). <<dropbadmice>>= badmice <- names(err.by.ind.prop)[err.by.ind.prop > 0.05] badg <- subset(cleang, ind=!is.na(match(cleang$pheno$MouseNum, badmice))) cleang <- subset(cleang, ind=is.na(match(cleang$pheno$MouseNum, badmice))) f2g <- subset(cleang, ind=(cleang$pheno$Strain=="F2")) @ == Duplicate individuals == Let's look for F~2~ individuals with nearly duplicate genotype data. <<comparegeno,fig=TRUE>>= cg <- comparegeno(f2g) cgu <- cg[lower.tri(cg)] par(las=1, mar=c(5.1,4.1,0.6,0.6)) hist(cgu, main="", xlab="Proportion of matching genotypes", breaks=seq(0, 1, len=201)) rug(c(sort(cgu)[1:100], sort(cgu, dec=TRUE)[1:100]), col="blue") @ There are five pairs of mice with nearly matching genotypes. <<matching_mice>>= wh <- which(cg > 0.8, arr=TRUE) wh <- wh[wh[,1] < wh[,2],] cbind(as.character(f2g$pheno$MouseNum)[wh[,1]], as.character(f2g$pheno$MouseNum)[wh[,2]]) @ It's safe to assume that these are sample mix-ups, and that any mismatches are genotyping errors in one or the other cases. We'll omit genotypes that mismatch. <<omit_mismatching_genotypes>>= n.omit <- rep(0,nrow(wh)) for(i in 1:nrow(wh)) { for(j in 1:nchr(f2g)) { g1 <- f2g$geno[[j]]$data[wh[i,1],] g2 <- f2g$geno[[j]]$data[wh[i,2],] toomit <- !is.na(g1) & !is.na(g2) & g1 != g2 f2g$geno[[j]]$data[wh[i,1], toomit] <- NA f2g$geno[[j]]$data[wh[i,2], toomit] <- NA n.omit[i] <- n.omit[i] + sum(toomit) } } n.omit @ As seen, the five pairs have between \Sexpr{min(n.omit)} and \Sexpr{max(n.omit)} genotyping errors. == Estimated genetic map == Let's re-estimate the genetic map to see how it differs from the map from Cox et al. (_Genetics,_ 182:1335-1344, 2009). I'll use a genotype error rate of 2/1000, and the Carter-Falconer map function (which corresponds, approximately, to the high level of crossover interference in the mouse). First we should drop one mouse from each of the \Sexpr{length(n.omit)} duplicate pairs. <<drop_duplicates>>= f2g.nodup <- subset(f2g, ind=(-wh[,2])) @ Now we pull out the current map and re-estimate the map with the current data. In +est.map()+, I use +n.cluster=8+ to run multiple chromosomes in parallel (a feature added in R/qtl version 1.20-4). <<estmap,cache=TRUE>>= origmap <- pull.map(f2g.nodup, chr="-un") newmap <- est.map(f2g.nodup, chr="-un", err=0.002, map.function="c-f", n.cluster=8) @ Let's look at summaries of the original and the newly estimated maps. <<summarymap>>= cbind(summary.map(origmap), summary.map(newmap))[,c(1,5,2,6,3,7,4,8)] @ Overall, the map hasn't changed much. Let's plot the chromosome lengths against each other. <<plotchrlengths,fig=TRUE>>= par(las=1, mar=c(5.1,4.1,0.6,0.6)) plot(summary.map(origmap)[-21,2], summary.map(newmap)[-21,2], type="n", xlab="Original chr length (cM)", ylab="New chr length (cM)") abline(0,1, lty=2, col="gray") text(summary.map(origmap)[-21,2], summary.map(newmap)[-21,2], c(1:19,"X")) @ Let's plot the map next to the one embedded in the cross (which corresponds to that of Cox et al. 2009). The map on the bottom is the newly estimated one. <<plotmap, fig=TRUE>>= par(mar=c(5.1,4.1,0.6,0.6)) plot.map(origmap, newmap, horizontal=TRUE, main="") @ We can also plot the individual interval lengths against each other. It's better to plot the difference against either the average or against one of the two lengths. We'll plot the difference versus the original length (that is, in the Cox et al. maps). I've highlighted in red those intervals that increased in length by more than 4 cM; the numbers indicate the chromosome for the interval. <<plotintervallengths,fig=TRUE>>= par(las=1, mar=c(5.1,4.1,0.6,0.6)) do <- unlist(lapply(origmap, diff)) dn <- unlist(lapply(newmap, diff)) thechr <- rep(c(1:19,"X"), lapply(newmap, function(a) length(a)-1)) plot(do, dn-do, type="n", xlab="Original interval length (cM)", ylab="Change (new - old) in interval length (cM)", ylim=c(-5,15)) abline(h=0, lty=2, col="gray") points(do, (dn-do), col=c("black","red")[(dn-do>4)+1]) text(do[dn-do>4]+0.25, (dn-do)[dn-do>4], thechr[dn-do>4], col="red", adj=c(0,0.5)) @ == Load and reformat the physical map == Load and reformat the physical map, so that we can save it with the cross data. <<load_and_reformat_pmap>>= temp <- read.csv("Maps/finalmap.csv", as.is=TRUE) temp <- temp[!is.na(match(temp$marker, markernames(f2g))),] pmap <- vector("list", 20) names(pmap) <- c(1:19,"X") for(i in c(1:19,"X")) { pmap[[i]] <- temp[temp[,2]==i,3]/10^6 names(pmap[[i]]) <- temp[temp[,2]==i,1] class(pmap[[i]]) <- ifelse(i=="X", "X", "A") } class(pmap) <- "map" @ Are there really the same number of markers per chromosome and are the markers in the same order? <<check_same_order>>= all(sapply(pmap, length) == nmar(f2g["-un",])) all(unlist(lapply(pmap, names)) == markernames(f2g["-un",])) @ == Save the clean cross object == <<savecross>>= save(cleang, f2g, newmap, pmap, file="Data/clean_cross.RData") save(badg, file="Data/bad_mice.RData") @