Added gsa.reshape.concordance.table function to gsalib

This commit is contained in:
Phillip Dexheimer 2015-02-04 23:02:54 -05:00
parent 517320092c
commit 4d4d33404e
6 changed files with 75 additions and 6 deletions

View File

@ -82,7 +82,7 @@ import java.util.*;
* <p>
* It may be informative to reshape rows of the GenotypeConcordance counts and proportions tables into separate row-major tables
* where the columns indicate the COMP genotype and the rows indicate the EVAL genotype for easy comparison between the
* two callsets. This can be done with a command similar to d <- matrix(sampleRow,nrow=6,byrow=T) in R where sampleRow is the 36-value row corresponding to the sample of interest, excluding "Mismatching_Alleles".
* two callsets. This can be done with the gsa.reshape.concordance.table function in the gsalib R library.
* In Excel this can be accomplished using the OFFSET function.
* </p>
* <ul>

View File

@ -1,8 +1,8 @@
Package: gsalib
Type: Package
Title: Utility Functions For GATK
Version: 2.1
Date: 2014-12-09
Version: 2.2
Date: 2015-03-17
Author: Kiran Garimella
Maintainer: Geraldine Van der Auwera <vdauwera@broadinstitute.org>
Description: This package contains utility functions used by the Genome Analysis Toolkit (GATK) to load tables and plot data. The GATK is a toolkit for variant discovery in high-throughput sequencing data.

View File

@ -1 +1,2 @@
export(gsa.read.gatkreport)
export(gsa.read.gatkreport)
export(gsa.reshape.concordance.table)

View File

@ -0,0 +1,20 @@
# Reshape one sample's row of a GenotypeConcordance table into a matrix with
# the Eval genotypes in rows and the Comp genotypes in columns.
#
# Arguments:
#   data        A list of tables (e.g. a GATKReport) from which `table.name`
#               is extracted with `[[`. If `table.name` is NULL, `data` is
#               used directly as the table: a data frame with a `Sample`
#               column and one column per "<EVAL>_<COMP>" genotype pair.
#   table.name  Name of the table to pull out of `data`, or NULL.
#   sample.name Value of the `Sample` column identifying the row to reshape.
#
# Value:
#   A matrix with dimnames `EvalGenotypes` x `CompGenotypes`, restricted to
#   the genotypes that occur in at least one column name of the table.
#   Combinations whose column is absent are NA. Returns NULL when the table
#   is missing, and NULL (with a warning) when the sample is not found.
gsa.reshape.concordance.table <- function(data, table.name="GenotypeConcordance_Counts", sample.name="ALL") {
  if (!is.null(table.name)) {
    data <- data[[table.name]]
  }
  if (is.null(data)) {
    return(NULL)
  }

  possible.genotypes <- c("NO_CALL", "HOM_REF", "HET", "HOM_VAR", "UNAVAILABLE", "MIXED")
  # combinations[i, j] is the column name for Eval genotype i / Comp genotype j.
  combinations <- outer(possible.genotypes, possible.genotypes, paste, sep = "_")

  # Only genotype-pair columns are considered, so bookkeeping columns such as
  # Sample and Mismatching_Alleles are excluded by name rather than position.
  existing.combi <- matrix(combinations %in% colnames(data),
                           nrow = length(possible.genotypes))
  if (!any(existing.combi)) {
    stop("No genotype concordance columns found in the table", call. = FALSE)
  }
  eval.genotypes <- rowSums(existing.combi) > 0
  comp.genotypes <- colSums(existing.combi) > 0

  # which() drops NA comparisons, so rows with a missing Sample never match.
  sample.rows <- which(data$Sample == sample.name)
  if (length(sample.rows) == 0) {
    warning("Sample '", sample.name, "' not found in concordance table",
            call. = FALSE)
    return(NULL)
  }
  if (length(sample.rows) > 1) {
    warning("Sample '", sample.name, "' matches ", length(sample.rows),
            " rows; using the first", call. = FALSE)
  }

  # unlist() turns the one-row data frame into a plain named vector; passing
  # the data frame itself to matrix() would yield a list-mode matrix.
  present.columns <- intersect(colnames(data), combinations)
  counts <- unlist(data[sample.rows[1], present.columns, drop = FALSE])

  # Look values up by name so the result does not depend on column order.
  wanted <- combinations[eval.genotypes, comp.genotypes, drop = FALSE]
  values <- unname(counts[match(wanted, names(counts))])

  array(values,
        dim = dim(wanted),
        dimnames = list(EvalGenotypes = possible.genotypes[eval.genotypes],
                        CompGenotypes = possible.genotypes[comp.genotypes]))
}

View File

@ -0,0 +1,48 @@
\name{gsa.reshape.concordance.table}
\alias{gsa.reshape.concordance.table}
\title{
Reshape a Concordance Table
}
\description{
Given a GATKReport generated by GenotypeConcordance (as output by \code{gsa.read.gatkreport}), this function reshapes the concordance values for a specified sample into a matrix with the Eval genotypes in rows and the Comp genotypes in columns. See the documentation for GenotypeConcordance for the definitions of Eval and Comp.
}
\usage{
gsa.reshape.concordance.table(data, table.name="GenotypeConcordance_Counts", sample.name="ALL")
}
\arguments{
\item{data}{
A GATKReport as output by \code{gsa.read.gatkreport}. If \code{table.name} is \code{NULL}, \code{data} is assumed to be the concordance table itself (a data frame laid out like the GenotypeConcordance tables, with a \code{Sample} column) rather than a whole report.
}
\item{table.name}{
The table name in the GATKReport to reshape. Defaults to "GenotypeConcordance_Counts", but could also be one of the proportion tables ("GenotypeConcordance_EvalProportions", "GenotypeConcordance_CompProportions"). This value can also be \code{NULL}, in which case \code{data} is reshaped directly.
}
\item{sample.name}{
The sample name (value of the \code{Sample} column) within the selected table to use.
}
}
\value{
Returns a two-dimensional matrix with Eval genotypes in the rows and Comp genotypes in the columns. The genotypes themselves (\code{HOM_REF}, \code{NO_CALL}, etc.) are given by the row and column names of the matrix. Returns \code{NULL} if the requested table is not present in the report.
}
\author{
Phillip Dexheimer
}
\seealso{
\code{\link{gsa.read.gatkreport}}
}
\examples{
test_file = system.file("inst", "extdata", "test_gatkreport.table", package = "gsalib")
report = gsa.read.gatkreport(test_file)
gsa.reshape.concordance.table(report)
## Output looks like:
## CompGenotypes
##EvalGenotypes NO_CALL HOM_REF HET HOM_VAR UNAVAILABLE MIXED
## NO_CALL 0 0 0 0 0 0
## HOM_REF 0 2 0 0 0 0
## HET 0 3 0 0 0 0
## HOM_VAR 0 2 0 0 0 0
## UNAVAILABLE 0 0 0 0 0 0
## MIXED 0 0 0 0 0 0
}
\keyword{ manip }

View File

@ -12,8 +12,8 @@ Utility functions for analysis of genome sequence data with the GATK
\tabular{ll}{
Package: \tab gsalib\cr
Type: \tab Package\cr
Version: \tab 2.1\cr
Date: \tab 2014-12-09\cr
Version: \tab 2.2\cr
Date: \tab 2015-03-17\cr
License: \tab MIT\cr
LazyLoad: \tab yes\cr
}