Changed calculation of rank sum so it's now the median of the per-sample rank sums (because the rank sum across all sample data did worse for biallelics). Now only called alt for 0/1s has rank sum, similar to old paradigm. Combined gVCF rank sums stored in Histogram, then median is taken once during genotyping. Histogram uses Integer keys to eliminate numerical precision mess

This commit is contained in:
Laura Gauthier 2016-09-28 08:41:20 -04:00
parent d512f3f808
commit 35d0b72c79
7 changed files with 431 additions and 82 deletions

View File

@ -60,6 +60,7 @@ import org.broadinstitute.gatk.engine.GenomeAnalysisEngine;
import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.AnnotatorCompatible;
import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.ReducibleAnnotation;
import org.broadinstitute.gatk.utils.MannWhitneyU;
import org.broadinstitute.gatk.utils.MathUtils;
import org.broadinstitute.gatk.utils.contexts.AlignmentContext;
import org.broadinstitute.gatk.utils.contexts.ReferenceContext;
import org.broadinstitute.gatk.utils.exceptions.GATKException;
@ -78,10 +79,11 @@ import java.util.*;
*/
public abstract class AS_RankSumTest extends RankSumTest implements ReducibleAnnotation {
private final static Logger logger = Logger.getLogger(AS_RMSAnnotation.class);
protected final String splitDelim = "\\|"; //String.split takes a regex, so we need to escape the pipe
protected final String printDelim = "|";
protected final String reducedDelim = ",";
protected AnnotatorCompatible callingWalker;
private final String splitDelim = "\\|"; //String.split takes a regex, so we need to escape the pipe
private final String printDelim = "|";
private final String reducedDelim = ","; //delimiter for list of reduced values
private final String rawDelim = ","; //delimiter for list of raw values
private AnnotatorCompatible callingWalker;
@Override
public void initialize(final AnnotatorCompatible walker, final GenomeAnalysisEngine toolkit, final Set<VCFHeaderLine> headerLines) {
@ -109,20 +111,21 @@ public abstract class AS_RankSumTest extends RankSumTest implements ReducibleAnn
return new HashMap<>();
final Map<String, Object> annotations = new HashMap<>();
final AlleleSpecificAnnotationData<CompressedDataList<Integer>> myData = initializeNewAnnotationData(vc.getAlleles());
calculateRawData(vc, perReadAlleleLikelihoodMap, myData);
final String annotationString = makeRawAnnotationString(vc.getAlleles(), myData.getAttributeMap());
final AlleleSpecificAnnotationData<CompressedDataList<Integer>> myRawData = initializeNewRawAnnotationData(vc.getAlleles());
calculateRawData(vc, perReadAlleleLikelihoodMap, myRawData);
Map<Allele, List<Double>> myRankSumStats = calculateRankSum(myRawData.getAttributeMap(), myRawData.getRefAllele());
final String annotationString = makeRawAnnotationString(vc.getAlleles(), myRankSumStats);
annotations.put(getRawKeyName(), annotationString);
return annotations;
}
protected void parseRawDataString(final ReducibleAnnotationData<CompressedDataList<Integer>> myData) {
protected void parseRawDataString(final ReducibleAnnotationData<Histogram> myData) {
final String rawDataString = myData.getRawData();
String rawDataNoBrackets;
final Map<Allele, CompressedDataList<Integer>> perAlleleValues = new HashMap<>();
final Map<Allele, Histogram> perAlleleValues = new HashMap<>();
//Initialize maps
for (final Allele current : myData.getAlleles()) {
perAlleleValues.put(current, new CompressedDataList<Integer>());
perAlleleValues.put(current, new Histogram());
}
//Map gives back list with []
if (rawDataString.charAt(0) == '[') {
@ -131,40 +134,61 @@ public abstract class AS_RankSumTest extends RankSumTest implements ReducibleAnn
else {
rawDataNoBrackets = rawDataString;
}
//rawDataPerAllele is the list of values for each allele (each of variable length)
//rawDataPerAllele is a per-sample list of the rank sum statistic for each allele
final String[] rawDataPerAllele = rawDataNoBrackets.split(splitDelim);
for (int i=0; i<rawDataPerAllele.length; i++) {
final String alleleData = rawDataPerAllele[i];
if (alleleData.isEmpty())
continue;
final CompressedDataList<Integer> alleleList = perAlleleValues.get(myData.getAlleles().get(i));
final Histogram alleleList = perAlleleValues.get(myData.getAlleles().get(i));
final String[] rawListEntriesAsStringVector = alleleData.split(",");
if (rawListEntriesAsStringVector.length %2 != 0)
throw new GATKException("ERROR: rank sum test raw annotation data must occur in <value,count> pairs");
for (int j=0; j<rawListEntriesAsStringVector.length; j+=2) {
int value, count;
for (int j=0; j<rawListEntriesAsStringVector.length; j++) {
Double value;
if (!rawListEntriesAsStringVector[j].isEmpty()) {
value = Integer.parseInt(rawListEntriesAsStringVector[j].trim());
value = Double.parseDouble(rawListEntriesAsStringVector[j].trim());
if(!value.isNaN())
alleleList.add(value);
}
}
}
myData.setAttributeMap(perAlleleValues);
myData.validateAllelesList();
}
protected void parseCombinedDataString(final ReducibleAnnotationData<Histogram> myData) {
final String rawDataString = myData.getRawData();
String rawDataNoBrackets;
final Map<Allele, Histogram> perAlleleValues = new HashMap<>();
//Initialize maps
for (final Allele current : myData.getAlleles()) {
perAlleleValues.put(current, new Histogram());
}
//Map gives back list with []
if (rawDataString.charAt(0) == '[') {
rawDataNoBrackets = rawDataString.substring(1, rawDataString.length() - 1);
}
else {
rawDataNoBrackets = rawDataString;
}
//rawDataPerAllele is a string representation of the Histogram for each allele in the form value, count, value, count...
final String[] rawDataPerAllele = rawDataNoBrackets.split(splitDelim);
for (int i=0; i<rawDataPerAllele.length; i++) {
final String alleleData = rawDataPerAllele[i];
final Histogram alleleList = perAlleleValues.get(myData.getAlleles().get(i));
final String[] rawListEntriesAsStringVector = alleleData.split(rawDelim);
for (int j=0; j<rawListEntriesAsStringVector.length; j+=2) {
Double value;
int count;
if (!rawListEntriesAsStringVector[j].isEmpty()) {
value = Double.parseDouble(rawListEntriesAsStringVector[j].trim());
if (!rawListEntriesAsStringVector[j + 1].isEmpty()) {
count = Integer.parseInt(rawListEntriesAsStringVector[j + 1].trim());
alleleList.add(value,count);
if(!value.isNaN())
alleleList.add(value,count);
}
}
}
}
myData.setAttributeMap(perAlleleValues);
//check the alleles list
boolean foundRef = false;
for (final Allele a : myData.getAlleles()) {
if (a.isReference()) {
if (foundRef)
throw new GATKException("ERROR: multiple reference alleles found in annotation data\n");
foundRef = true;
}
}
if (!foundRef)
throw new GATKException("ERROR: no reference alleles found in annotation data\n");
myData.validateAllelesList();
}
@Override
@ -178,43 +202,76 @@ public abstract class AS_RankSumTest extends RankSumTest implements ReducibleAnn
}
final Map<String, Object> annotations = new HashMap<>();
final String annotationString = makeRawAnnotationString(vcAlleles, combinedData.getAttributeMap());
final String annotationString = makeCombinedAnnotationString(vcAlleles, combinedData.getAttributeMap());
annotations.put(getRawKeyName(), annotationString);
return annotations;
}
protected AlleleSpecificAnnotationData initializeNewAnnotationData(final List<Allele> vcAlleles) {
protected AlleleSpecificAnnotationData initializeNewRawAnnotationData(final List<Allele> vcAlleles) {
Map<Allele, CompressedDataList<Integer>> perAlleleValues = new HashMap<>();
for (Allele a : vcAlleles) {
perAlleleValues.put(a, new CompressedDataList<Integer>());
}
AlleleSpecificAnnotationData ret = new AlleleSpecificAnnotationData(vcAlleles, perAlleleValues.toString());
final AlleleSpecificAnnotationData ret = new AlleleSpecificAnnotationData(vcAlleles, perAlleleValues.toString());
ret.setAttributeMap(perAlleleValues);
return ret;
}
protected void combineAttributeMap(final ReducibleAnnotationData<CompressedDataList<Integer>> toAdd, final ReducibleAnnotationData<CompressedDataList<Integer>> combined) {
/**
 * Create a fresh per-allele annotation data container in which every allele in the
 * variant context maps to an empty Histogram, ready to accumulate combined rank sum values.
 *
 * @param vcAlleles the alleles of the variant context being annotated
 * @return allele-specific annotation data with an empty Histogram attribute per allele
 */
protected AlleleSpecificAnnotationData initializeNewAnnotationData(final List<Allele> vcAlleles) {
    final Map<Allele, Histogram> histogramsByAllele = new HashMap<>();
    for (final Allele allele : vcAlleles) {
        histogramsByAllele.put(allele, new Histogram());
    }
    //the raw-data string is seeded from the (empty) map's string form, matching the attribute map
    final AlleleSpecificAnnotationData annotationData =
            new AlleleSpecificAnnotationData(vcAlleles, histogramsByAllele.toString());
    annotationData.setAttributeMap(histogramsByAllele);
    return annotationData;
}
protected void combineAttributeMap(final ReducibleAnnotationData<Histogram> toAdd, final ReducibleAnnotationData<Histogram> combined) {
for (final Allele a : combined.getAlleles()) {
if (toAdd.hasAttribute(a)) {
final CompressedDataList<Integer> alleleData = combined.getAttribute(a);
alleleData.add(toAdd.getAttribute(a));
combined.putAttribute(a, alleleData);
final Histogram alleleData = combined.getAttribute(a);
if (toAdd.getAttribute(a) != null) {
alleleData.add(toAdd.getAttribute(a));
combined.putAttribute(a, alleleData);
}
}
}
}
protected String makeRawAnnotationString(final List<Allele> vcAlleles, final Map<Allele, CompressedDataList<Integer>> perAlleleValues) {
protected String makeRawAnnotationString(final List<Allele> vcAlleles, final Map<Allele, List<Double>> perAlleleValues) {
String annotationString = "";
for (int i =0; i< vcAlleles.size(); i++) {
if (i!=0)
annotationString += printDelim;
CompressedDataList<Integer> alleleValues = perAlleleValues.get(vcAlleles.get(i));
annotationString += alleleValues.toString();
for (int i = 0; i< vcAlleles.size(); i++) {
if (vcAlleles.get(i).isReference())
continue;
if (i != 0)
annotationString += printDelim;
final List<Double> alleleValue = perAlleleValues.get(vcAlleles.get(i));
//can be null if there are no ref reads
if (alleleValue == null)
continue;
annotationString += formatListAsString(alleleValue);
}
return annotationString;
}
protected String makeReducedAnnotationString(VariantContext vc, Map<Allele,Double> perAltRankSumResults) {
/**
 * Build the combined (gVCF merge stage) raw annotation string: each alternate allele's
 * Histogram serialized via {@code Histogram.toString()}, joined by the print delimiter.
 * Reference alleles contribute no histogram text, but any non-first allele still emits a
 * delimiter, so allele positions remain recoverable on the parse side.
 *
 * @param vcAlleles       the alleles of the variant context, in order
 * @param perAlleleValues combined histograms keyed by allele; entries may be absent/null
 * @return the delimiter-joined per-allele histogram string (may be empty)
 */
protected String makeCombinedAnnotationString(final List<Allele> vcAlleles, final Map<Allele, Histogram> perAlleleValues) {
String annotationString = "";
for (int i = 0; i< vcAlleles.size(); i++) {
//reference alleles carry no rank sum histogram
if (vcAlleles.get(i).isReference())
continue;
if (i != 0)
annotationString += printDelim;
final Histogram alleleValue = perAlleleValues.get(vcAlleles.get(i));
//can be null if there are no ref reads
if (alleleValue == null)
continue;
annotationString += alleleValue.toString();
}
return annotationString;
}
private String makeReducedAnnotationString(final VariantContext vc, final Map<Allele,Double> perAltRankSumResults) {
String annotationString = "";
for (final Allele a : vc.getAlternateAlleles()) {
if (!annotationString.isEmpty())
@ -242,8 +299,8 @@ public abstract class AS_RankSumTest extends RankSumTest implements ReducibleAnn
return new HashMap<>();
final Map<String,Object> annotations = new HashMap<>();
final AlleleSpecificAnnotationData<CompressedDataList<Integer>> myData = new AlleleSpecificAnnotationData(originalVC.getAlleles(), rawRankSumData);
parseRawDataString(myData);
final AlleleSpecificAnnotationData<Histogram> myData = new AlleleSpecificAnnotationData(originalVC.getAlleles(), rawRankSumData);
parseCombinedDataString(myData);
final Map<Allele, Double> perAltRankSumResults = calculateReducedData(myData.getAttributeMap(), myData.getRefAllele());
//shortcut for no ref values
@ -254,27 +311,37 @@ public abstract class AS_RankSumTest extends RankSumTest implements ReducibleAnn
return annotations;
}
public void calculateRawData(VariantContext vc, Map<String, PerReadAlleleLikelihoodMap> pralm, ReducibleAnnotationData myData) {
public void calculateRawData(final VariantContext vc, final Map<String, PerReadAlleleLikelihoodMap> pralm, final ReducibleAnnotationData myData) {
if( vc.getGenotypes().getSampleNames().size() != 1)
throw new IllegalStateException("Calculating raw data for allele-specific rank sums requires variant context input with exactly one sample, as in a gVCF.");
if(pralm == null)
return;
final Map<Allele, CompressedDataList<Integer>> perAlleleValues = myData.getAttributeMap();
for ( final PerReadAlleleLikelihoodMap likelihoodMap : pralm.values() ) {
if ( likelihoodMap != null && !likelihoodMap.isEmpty() ) {
fillQualsFromLikelihoodMap(vc.getAlleles(), vc.getStart(), likelihoodMap, perAlleleValues);
fillQualsFromLikelihoodMap(vc.getGenotype(0).getAlleles(), vc.getStart(), likelihoodMap, perAlleleValues);
}
}
}
/**
*
* @param alleles the alleles that were called in the genotype for this sample
* @param refLoc
* @param likelihoodMap
* @param perAlleleValues
*/
private void fillQualsFromLikelihoodMap(final List<Allele> alleles,
final int refLoc,
final PerReadAlleleLikelihoodMap likelihoodMap,
final Map<Allele, CompressedDataList<Integer>> perAlleleValues) {
for ( final Map.Entry<GATKSAMRecord, Map<Allele,Double>> el : likelihoodMap.getLikelihoodReadMap().entrySet() ) {
final MostLikelyAllele a = PerReadAlleleLikelihoodMap.getMostLikelyAllele(el.getValue());
if ( ! a.isInformative() )
continue; // read is non-informative
if ( ! a.isInformative() || ! alleles.contains(a.getMostLikelyAllele()))
continue; // read is non-informative or supports an allele that was not called
final GATKSAMRecord read = el.getKey();
if ( isUsableRead(read, refLoc) ) {
@ -289,8 +356,8 @@ public abstract class AS_RankSumTest extends RankSumTest implements ReducibleAnn
}
}
public Map<Allele, Double> calculateReducedData(final Map<Allele, CompressedDataList<Integer>> perAlleleValues, final Allele ref) {
final Map<Allele, Double> perAltRankSumResults = new HashMap<>();
public Map<Allele, List<Double>> calculateRankSum(final Map<Allele, CompressedDataList<Integer>> perAlleleValues, final Allele ref) {
final Map<Allele, List<Double>> perAltRankSumResults = new HashMap<>();
//shortcut to not try to calculate rank sum if there are no reads that unambiguously support the ref
if (perAlleleValues.get(ref).isEmpty())
return perAltRankSumResults;
@ -299,12 +366,12 @@ public abstract class AS_RankSumTest extends RankSumTest implements ReducibleAnn
continue;
final MannWhitneyU mannWhitneyU = new MannWhitneyU();
//load alts (series 1)
List<Double> alts = new ArrayList<>();
final List<Double> alts = new ArrayList<>();
for (final Number qual : perAlleleValues.get(alt)) {
alts.add((double) qual.intValue());
}
//load refs (series 2)
List<Double> refs = new ArrayList<>();
final List<Double> refs = new ArrayList<>();
for (final Number qual : perAlleleValues.get(ref)) {
refs.add((double) qual.intValue());
}
@ -322,9 +389,31 @@ public abstract class AS_RankSumTest extends RankSumTest implements ReducibleAnn
}
// we are testing that set1 (the alt bases) have lower quality scores than set2 (the ref bases)
final MannWhitneyU.Result result = mannWhitneyU.test(convertToArray(alts), convertToArray(refs), MannWhitneyU.TestType.FIRST_DOMINATES);
perAltRankSumResults.put(alt, result.getZ());
perAltRankSumResults.put(alt, Collections.singletonList(result.getZ()));
}
return perAltRankSumResults;
}
/**
 * Reduce the combined per-allele histograms to a single rank sum value per alternate allele
 * by taking the median of each allele's histogram.
 *
 * @param perAlleleValues combined histograms keyed by allele (may contain null histograms)
 * @param ref             the reference allele, which is excluded from the results
 * @return map from each alternate allele to its median rank sum (value may be null for an empty histogram)
 */
public Map<Allele, Double> calculateReducedData(final Map<Allele, Histogram> perAlleleValues, final Allele ref) {
    final Map<Allele, Double> reducedValues = new HashMap<>();
    for (final Map.Entry<Allele, Histogram> entry : perAlleleValues.entrySet()) {
        final Allele allele = entry.getKey();
        final Histogram histogram = entry.getValue();
        //skip the reference allele (ignoring base qualities in the comparison) and missing histograms
        if (allele.equals(ref, false) || histogram == null)
            continue;
        reducedValues.put(allele, histogram.median());
    }
    return reducedValues;
}
/**
 * Format a list of rank sum values as a delimiter-separated string with three decimal places.
 *
 * @param rankSumValues the per-sample rank sum values for one allele (may contain NaN)
 * @return the formatted, delimiter-joined string (empty for an empty list)
 */
public String formatListAsString(final List<Double> rankSumValues) {
    final StringBuilder formattedString = new StringBuilder();
    for (int i = 0; i < rankSumValues.size(); i++) {
        if (i != 0)
            formattedString.append(reducedDelim);
        //we can get NaNs if one of the ref or alt lists is empty (e.g. homVar genotypes), but VQSR will process them appropriately downstream
        //use Locale.ROOT so the decimal separator is always '.': the default locale may print ',' which
        //collides with the list delimiter and breaks Double.parseDouble on the read side
        formattedString.append(String.format(Locale.ROOT, "%.3f", rankSumValues.get(i)));
    }
    return formattedString.toString();
}
}

View File

@ -333,7 +333,7 @@ public class HaplotypeCallerGVCFIntegrationTest extends WalkerTest {
public void testAlleleSpecificAnnotations() {
final String commandLine = String.format("-T HaplotypeCaller --pcr_indel_model NONE -pairHMMSub %s %s -R %s -I %s -L %s -ERC GVCF --no_cmdline_in_header -variant_index_type %s -variant_index_parameter %d -G Standard -G AS_Standard --disableDithering",
HMM_SUB_IMPLEMENTATION, ALWAYS_LOAD_VECTOR_HMM, b37KGReference, privateTestDir + "NA12878.HiSeq.b37.chr20.10_11mb.bam", "20:10433000-10437000", GATKVCFUtils.DEFAULT_GVCF_INDEX_TYPE, GATKVCFUtils.DEFAULT_GVCF_INDEX_PARAMETER);
final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Collections.singletonList("5877ccbc99bbaffbcd5fe3aaa3d7e7f7"));
final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Collections.singletonList("7e74286e2c412855509ea5312ea0ec57"));
spec.disableShadowBCF();
executeTest(" testAlleleSpecificAnnotations", spec);
}
@ -342,7 +342,7 @@ public class HaplotypeCallerGVCFIntegrationTest extends WalkerTest {
public void testASMQMateRankSumAnnotation() {
final String commandLine = String.format("-T HaplotypeCaller --pcr_indel_model NONE -pairHMMSub %s %s -R %s -I %s -L %s -ERC GVCF --no_cmdline_in_header -variant_index_type %s -variant_index_parameter %d -A AS_MQMateRankSumTest --disableDithering",
HMM_SUB_IMPLEMENTATION, ALWAYS_LOAD_VECTOR_HMM, b37KGReference, privateTestDir + "NA12878.HiSeq.b37.chr20.10_11mb.bam", "20:10433000-10437000", GATKVCFUtils.DEFAULT_GVCF_INDEX_TYPE, GATKVCFUtils.DEFAULT_GVCF_INDEX_PARAMETER);
final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Collections.singletonList("0381fec3b0d21508b28fa62c2a61ccfc"));
final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Collections.singletonList("e0b9135f68eb0b65cb36721dd3f40f00"));
spec.disableShadowBCF();
executeTest(" testASMQMateRankSumAnnotation", spec);
}
@ -351,7 +351,7 @@ public class HaplotypeCallerGVCFIntegrationTest extends WalkerTest {
public void testBetaTestingAnnotationGroup() {
final String commandLine = String.format("-T HaplotypeCaller --pcr_indel_model NONE -pairHMMSub %s %s -R %s -I %s -L %s -ERC GVCF --no_cmdline_in_header -variant_index_type %s -variant_index_parameter %d -G BetaTesting --disableDithering",
HMM_SUB_IMPLEMENTATION, ALWAYS_LOAD_VECTOR_HMM, b37KGReference, privateTestDir + "NA12878.HiSeq.b37.chr20.10_11mb.bam", "20:10433000-10437000", GATKVCFUtils.DEFAULT_GVCF_INDEX_TYPE, GATKVCFUtils.DEFAULT_GVCF_INDEX_PARAMETER);
final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Collections.singletonList("f248a6c4a7645dc5cc9f5ec9f81d9ad5"));
final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Collections.singletonList("81f1fcddfec262a8aaedb9ea37d89873"));
spec.disableShadowBCF();
executeTest(" testASMQMateRankSumAnnotation", spec);
}
@ -360,7 +360,7 @@ public class HaplotypeCallerGVCFIntegrationTest extends WalkerTest {
public void testASInsertSizeRankSum() {
final String commandLine = String.format("-T HaplotypeCaller --pcr_indel_model NONE -pairHMMSub %s %s -R %s -I %s -L %s -ERC GVCF --no_cmdline_in_header -variant_index_type %s -variant_index_parameter %d -G Standard -G AS_Standard --disableDithering -A AS_InsertSizeRankSum",
HMM_SUB_IMPLEMENTATION, ALWAYS_LOAD_VECTOR_HMM, b37KGReference, privateTestDir + "NA12878.HiSeq.b37.chr20.10_11mb.bam", "20:10433000-10437000", GATKVCFUtils.DEFAULT_GVCF_INDEX_TYPE, GATKVCFUtils.DEFAULT_GVCF_INDEX_PARAMETER);
final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Collections.singletonList("4599a591427c188c117f09ac40cc866f"));
final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Collections.singletonList("96934396683234559827a77c9bb38e21"));
spec.disableShadowBCF();
executeTest(" testASInsertSizeRankSum", spec);
}
@ -464,7 +464,7 @@ public class HaplotypeCallerGVCFIntegrationTest extends WalkerTest {
public void testHaplotypeCallerGVCSpanDel() {
final String commandLine = String.format("-T HaplotypeCaller -R %s -I %s -L 1:26357667 -ERC GVCF --no_cmdline_in_header -A AS_ReadPosRankSumTest -A ReadPosRankSumTest -variant_index_type %s -variant_index_parameter %d",
b37KGReference, privateTestDir + "NexPond-377866-1:26357600-26357700.bam", GATKVCFUtils.DEFAULT_GVCF_INDEX_TYPE, GATKVCFUtils.DEFAULT_GVCF_INDEX_PARAMETER);
final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Arrays.asList("bb12cf2dfa6f1fa0692395e295792584"));
final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Arrays.asList("db29a87113b358805e18ac8c20f9145d"));
spec.disableShadowBCF();
executeTest("testHaplotypeCallerGVCSpanDel", spec);
}

View File

@ -276,8 +276,8 @@ public class CombineGVCFsIntegrationTest extends WalkerTest {
@Test
public void testAlleleSpecificAnnotations() throws Exception {
final String cmd = "-T CombineGVCFs -R " + b37KGReference + " -o %s --no_cmdline_in_header -G Standard -G AS_Standard -V "
+ privateTestDir + "NA12878.AS.chr20snippet.g.vcf -V " + privateTestDir + "NA12891.AS.chr20snippet.g.vcf";
final WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, Arrays.asList("f6a7fa62c33de963c55262820effe44a"));
+ privateTestDir + "NA12878.AS.chr20snippet.g.vcf -V " + privateTestDir + "NA12892.AS.chr20snippet.g.vcf";
final WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, Arrays.asList("f053aaa50427ea3fa9a43240e56eefc1"));
spec.disableShadowBCF();
executeTest("testAlleleSpecificAnnotations", spec);
}
@ -285,17 +285,17 @@ public class CombineGVCFsIntegrationTest extends WalkerTest {
@Test
public void testASMateRankSumAnnotation() throws Exception {
final String cmd = "-T CombineGVCFs -R " + b37KGReference + " -o %s --no_cmdline_in_header -G Standard -G AS_Standard -A AS_MQMateRankSumTest -V "
+ privateTestDir + "NA12878.AS.MateRankSum.chr20snippet.g.vcf -V " + privateTestDir + "NA12891.AS.MateRankSum.chr20snippet.g.vcf";
final WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, Arrays.asList("2c264ed0057c93276c647f55998c4f25"));
+ privateTestDir + "NA12878.AS.MateRankSum.chr20snippet.g.vcf -V " + privateTestDir + "NA12892.AS.MateRankSum.chr20snippet.g.vcf";
final WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, Arrays.asList("951b59bf05eb4401da6a1bc8b2cdc261"));
spec.disableShadowBCF();
executeTest("testASMateRankSumAnnotation", spec);
}
@Test
public void testASInsertSizeRankSumAnnotation() throws Exception {
final String cmd = "-T CombineGVCFs -R " + b37KGReference + " -o %s --no_cmdline_in_header -G Standard -G AS_Standard -V "
+ privateTestDir + "NA12878.AS.InsertSizeRankSum.chr20snippet.g.vcf -V " + privateTestDir + "NA12891.AS.InsertSizeRankSum.chr20snippet.g.vcf";
final WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, Arrays.asList("ff5ca958e81e406cfe010d5649b5c0d1"));
final String cmd = "-T CombineGVCFs -R " + b37KGReference + " -o %s --no_cmdline_in_header -G Standard -G AS_Standard -A AS_InsertSizeRankSum -V "
+ privateTestDir + "NA12878.AS.InsertSizeRankSum.chr20snippet.g.vcf -V " + privateTestDir + "NA12892.AS.InsertSizeRankSum.chr20snippet.g.vcf";
final WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, Arrays.asList("9df5f327028768e4c5d20e5742d6d0ad"));
spec.disableShadowBCF();
executeTest("testASInsertSizeRankSumAnnotation", spec);
}

View File

@ -598,8 +598,8 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest {
@Test
public void testAlleleSpecificAnnotations() {
final String cmd = "-T GenotypeGVCFs -R " + b37KGReference + " -o %s --no_cmdline_in_header -G Standard -G AS_Standard --disableDithering -V "
+ privateTestDir + "NA12878.AS.chr20snippet.g.vcf -V " + privateTestDir + "NA12891.AS.chr20snippet.g.vcf";
final WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, Collections.singletonList("29d6db0a93abd72d64fb1e82da65c715"));
+ privateTestDir + "NA12878.AS.chr20snippet.g.vcf -V " + privateTestDir + "NA12892.AS.chr20snippet.g.vcf";
final WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, Collections.singletonList("f8ea6b63ea0bf7b0fdc266845762b927"));
spec.disableShadowBCF();
executeTest("testAlleleSpecificAnnotations", spec);
}
@ -607,17 +607,17 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest {
@Test
public void testASMateRankSumAnnotation() {
final String cmd = "-T GenotypeGVCFs -R " + b37KGReference + " -o %s --no_cmdline_in_header -G Standard -G AS_Standard -A AS_MQMateRankSumTest --disableDithering -V "
+ privateTestDir + "NA12878.AS.MateRankSum.chr20snippet.g.vcf -V " + privateTestDir + "NA12891.AS.MateRankSum.chr20snippet.g.vcf";
final WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, Collections.singletonList("2a330015a7db9f9aee9bc5b776698f73"));
+ privateTestDir + "NA12878.AS.MateRankSum.chr20snippet.g.vcf -V " + privateTestDir + "NA12892.AS.MateRankSum.chr20snippet.g.vcf";
final WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, Collections.singletonList("a3fc424534c31b43bdaaa4de3803f73b"));
spec.disableShadowBCF();
executeTest("testASMateRankSumAnnotation", spec);
}
@Test
public void testASInsertSizeRankSumAnnotation() {
final String cmd = "-T GenotypeGVCFs -R " + b37KGReference + " -o %s --no_cmdline_in_header -G Standard -G AS_Standard --disableDithering -V "
+ privateTestDir + "NA12878.AS.InsertSizeRankSum.chr20snippet.g.vcf -V " + privateTestDir + "NA12891.AS.InsertSizeRankSum.chr20snippet.g.vcf";
final WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, Collections.singletonList("75aee1e0c8c3528180e344ec6c0d8ffd"));
final String cmd = "-T GenotypeGVCFs -R " + b37KGReference + " -o %s --no_cmdline_in_header -G Standard -G AS_Standard -A AS_InsertSizeRankSum --disableDithering -V "
+ privateTestDir + "NA12878.AS.InsertSizeRankSum.chr20snippet.g.vcf -V " + privateTestDir + "NA12892.AS.InsertSizeRankSum.chr20snippet.g.vcf";
final WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, Collections.singletonList("55e14518c955de9158d98e55da75b643"));
spec.disableShadowBCF();
executeTest("testASInsertSizeRankSumAnnotation", spec);
}
@ -630,19 +630,19 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest {
public void testAlleleSpecificAnnotations_oneSample() {
final String cmd = "-T GenotypeGVCFs -R " + b37KGReference + " -o %s --no_cmdline_in_header -G Standard -G AS_Standard --disableDithering -V "
+ privateTestDir + "NA12878.AS.chr20snippet.g.vcf";
final WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, Collections.singletonList("f4fa3acec2b21037368898e913b7a3fa"));
final WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, Collections.singletonList("78c3ed48b17faebf77f4cc9f225b9045"));
spec.disableShadowBCF();
executeTest("testAlleleSpecificAnnotations_oneSample", spec);
}
@Test
//do at least 10 samples so InbreedingCoeff and AS_InbreedingCoeff are output
public void testAlleleSpecificAnnotations_elevenSamples() {
final String cmd = "-T GenotypeGVCFs -R " + b37KGReference + " -o %s --no_cmdline_in_header -G Standard -G AS_Standard --disableDithering -V "
public void testAlleleSpecificAnnotations_moreSamples() {
final String cmd = "-T GenotypeGVCFs -R " + b37KGReference + " -o %s --no_cmdline_in_header -G Standard -G AS_Standard -A AS_InsertSizeRankSum -A AS_MQMateRankSumTest --disableDithering -V "
+ privateTestDir + "multiSamples.g.vcf";
final WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, Collections.singletonList("4e90f6908248fac9b3ce3e545180a8e5"));
final WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, Collections.singletonList("5b007393f598cde9f18b03d4ac5cab25"));
spec.disableShadowBCF();
executeTest("testAlleleSpecificAnnotations_elevenSamples", spec);
executeTest("testAlleleSpecificAnnotations_moreSamples", spec);
}
@Test
@ -658,7 +658,7 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest {
public void testFractionInformativeReads() {
final String cmd = "-T GenotypeGVCFs -R " + b37KGReference + " -G AS_Standard -o %s --no_cmdline_in_header -A FractionInformativeReads --disableDithering -V "
+ privateTestDir + "NA12878.AS.chr20snippet.g.vcf -V " + privateTestDir + "NA12891.AS.chr20snippet.g.vcf";
final WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, Collections.singletonList("0b1bbcc7d24f8b0945c97907b1cdd974"));
final WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, Collections.singletonList("e5db4edf2d2b2abdb0b07a57d658785b"));
spec.disableShadowBCF();
executeTest("testAlleleSpecificAnnotations", spec);
}

View File

@ -0,0 +1,143 @@
/*
* Copyright 2012-2016 Broad Institute, Inc.
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.gatk.tools.walkers.annotator;
import org.broadinstitute.gatk.utils.exceptions.GATKException;
import java.util.Arrays;
import java.util.Locale;
/**
* Created by gauthier on 11/1/16.
*/
/**
 * A histogram over double values, binned at a fixed resolution and keyed by integer bin
 * indices to avoid floating-point precision issues when accumulating and serializing
 * per-allele rank sum values across gVCFs.
 *
 * Created by gauthier on 11/1/16.
 */
public class Histogram {
    private Double binSize;          //width of each bin; stored keys are integer multiples of this
    private String precisionFormat;  //printf format matching the bin size's decimal precision
    private String printDelim;       //delimiter used by toString(); assigned there as ","
    final private Double BIN_EPSILON = 0.01;  //fraction of a bin added before flooring so boundary values stay in their bin

    private CompressedDataList<Integer> dataList = new CompressedDataList<>();

    /**
     * Create a histogram with the default bin size of 0.1.
     */
    public Histogram() {
        this.binSize = 0.1;
        precisionFormat = "%.1f";
    }

    /**
     * Create a histogram with the given bin size.
     *
     * @param binSize width of each bin; must be positive. NOTE(review): bin sizes > 1 would yield a
     *                negative derived precision — assumed to be <= 1 in practice, confirm with callers.
     */
    public Histogram(final Double binSize) {
        if (binSize <= 0)
            throw new GATKException("Histogram bin size must be positive, but was " + binSize);
        this.binSize = binSize;
        precisionFormat = "%." + Math.round(-Math.log10(binSize)) + "f";
    }

    /**
     * Add a single observation, binned to the histogram's resolution.
     * @throws GATKException if the binned value does not fit in an integer key
     */
    public void add(final Double d) {
        final long binKey = getBinnedValue(d);
        if (isValidBinKey(binKey))
            dataList.add((int) binKey);
        else
            throw new GATKException("Histogram values are suspiciously extreme. Failed to add " + d + " to the Histogram.");
    }

    /**
     * Add {@code count} observations of the same value.
     * @throws GATKException if the count is non-positive or the binned value overflows an integer key
     */
    public void add(final Double d, final int count) {
        if (count < 1)
            throw new GATKException("Cannot add non-positive counts to Histogram.");
        final long binKey = getBinnedValue(d);
        if (isValidBinKey(binKey))
            dataList.add((int) binKey, count);
        else
            throw new GATKException("Histogram values are suspiciously extreme. Failed to add " + d + " to the Histogram.");
    }

    /**
     * Merge another histogram into this one.
     * @throws GATKException if the two histograms were built with different bin sizes
     */
    public void add(final Histogram h) {
        if (!this.binSize.equals(h.binSize))
            throw new GATKException("Histogram bin sizes are mismatched -- cannot add bin size " + this.binSize + " to " + h.binSize);
        this.dataList.add(h.dataList);
    }

    /**
     * Retrieve the count for the bin containing {@code d}.
     * @return the bin's count, or null if the bin has no entries
     */
    public Integer get(final Double d) {
        final long binKey = getBinnedValue(d);
        if (isValidBinKey(binKey))
            return dataList.getValueCounts().get((int) binKey);
        else
            throw new GATKException("Requested value is suspiciously extreme. Failed to retrieve " + d + " from the Histogram.");
    }

    /**
     * Compute the median of the binned data (bin key times bin size; for an even count the
     * average of the two middle binned values).
     *
     * @return the median, or null if the Histogram is empty
     */
    public Double median() {
        //total number of observations across all bins
        int numItems = 0;
        for (final int count : dataList.valueCounts.values()) {
            numItems += count;
        }
        final boolean oddNumberValues = numItems % 2 != 0;
        final int medianIndex = (numItems + 1) / 2;

        //BUGFIX: the backing map's iteration order is not guaranteed to be ascending
        //(toString() sorts keys explicitly), so sort the bin keys before walking
        //cumulative counts; otherwise the "middle" element found may not be the median
        final Object[] sortedKeys = dataList.valueCounts.keySet().toArray();
        Arrays.sort(sortedKeys);

        int counter = 0;
        Double firstMedian = null;
        for (final Object keyObj : sortedKeys) {
            final Integer key = (Integer) keyObj;
            counter += dataList.valueCounts.get(key);
            if (counter > medianIndex) {
                //either a single median bin, or the second of the two middle values straddling bins
                return firstMedian == null ? key * binSize : (firstMedian + key) / 2.0 * binSize;
            }
            if (counter == medianIndex) {
                if (oddNumberValues)
                    return key * binSize;
                //even count: remember the lower middle value; the upper one is in a later bin
                firstMedian = (double) key;
            }
        }
        return null;  //empty histogram
    }

    private long getBinnedValue(final double d) {
        return Math.round(Math.floor((d + BIN_EPSILON * binSize) / binSize)); //add a little epsilon before division so values exactly on bin boundaries will stay in the same bin
    }

    private boolean isValidBinKey(final long binnedValue) {
        return binnedValue <= Integer.MAX_VALUE && binnedValue >= Integer.MIN_VALUE;
    }

    /**
     * Serialize as "value,count,value,count,..." with bins in ascending order, printing bin
     * values at the precision implied by the bin size.
     */
    @Override
    public String toString() {
        printDelim = ",";
        final StringBuilder str = new StringBuilder();
        final Object[] keys = dataList.valueCounts.keySet().toArray();
        Arrays.sort(keys);
        for (final Object i : keys) {
            if (str.length() != 0)
                str.append(printDelim);
            //key i is an integer bin index; convert back to a double value at the specified precision.
            //Locale.ROOT keeps the decimal separator '.' — the default locale could print ',' which
            //collides with the pair delimiter and breaks Double.parseDouble on the parse side
            str.append(String.format(Locale.ROOT, precisionFormat, (double) (int) i * binSize))
               .append(printDelim)
               .append(dataList.valueCounts.get(i));
        }
        return str.toString();
    }
}

View File

@ -26,6 +26,7 @@
package org.broadinstitute.gatk.tools.walkers.annotator;
import htsjdk.variant.variantcontext.Allele;
import org.broadinstitute.gatk.utils.exceptions.GATKException;
import java.util.*;
@ -102,4 +103,17 @@ public class ReducibleAnnotationData<T> {
*/
public Map<Allele, T> getAttributeMap() {return Collections.unmodifiableMap(attributeMap);}
/**
 * Verifies that the allele list attached to this annotation data contains exactly
 * one reference allele.
 *
 * @throws GATKException if no reference allele, or more than one, is present
 */
public void validateAllelesList() {
    int referenceCount = 0;
    for (final Allele allele : this.getAlleles()) {
        if (allele.isReference()) {
            referenceCount++;
            // Fail as soon as a second reference allele is seen, matching the
            // original's throw timing.
            if (referenceCount > 1) {
                throw new GATKException("ERROR: multiple reference alleles found in annotation data\n");
            }
        }
    }
    if (referenceCount == 0) {
        throw new GATKException("ERROR: no reference alleles found in annotation data\n");
    }
}
}

View File

@ -0,0 +1,103 @@
/*
* Copyright 2012-2016 Broad Institute, Inc.
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.gatk.tools.walkers.annotator;
import org.testng.Assert;
import org.testng.annotations.Test;
/**
* Created by gauthier on 11/1/16.
*/
/**
 * Unit tests for {@link Histogram}: bin assignment for single and bulk adds, and
 * median semantics for empty, single-item, even-count and odd-count histograms.
 */
public class HistogramUnitTest {
    // Tolerance for floating-point comparisons of median values.
    private static final double EPSILON = 0.001;

    @Test
    public void testAdd() {
        // Values 1.000 .. 1.100: with the default bin size, 100 values land in the
        // 1.0 bin and a single value (1.100) lands in the 1.1 bin.
        final Histogram hist = new Histogram();
        for (int i = 0; i <= 100; i++) {
            hist.add(1 + i / 1000.0);
        }
        Assert.assertEquals(hist.get(1.0), Integer.valueOf(100), "");
        Assert.assertEquals(hist.get(1.1), Integer.valueOf(1), "");
    }

    @Test
    public void testAddingQuantizedValues() {
        // Repeated adds of the same value accumulate in one bin, which is then its own median.
        final Histogram hist = new Histogram();
        for (int i = 0; i < 100; i++) {
            hist.add(1.2);
        }
        Assert.assertEquals(hist.get(1.2), Integer.valueOf(100));
        Assert.assertEquals(hist.median(), 1.2, EPSILON);
    }

    @Test
    public void testBulkAdd() {
        // add(value, count) must behave like count separate add(value) calls.
        final Histogram hist = new Histogram();
        for (int i = 0; i <= 100; i++) {
            hist.add(1 + i / 1000.0, 2);
        }
        Assert.assertEquals(hist.get(1.0), Integer.valueOf(200), "");
        Assert.assertEquals(hist.get(1.1), Integer.valueOf(2), "");
    }

    @Test
    public void testMedianOfEvens() {
        // 10 items in each of two bins: the median is the average of the two middle values.
        final Histogram bimodalHist = new Histogram();
        for (int i = 0; i < 10; i++) {
            bimodalHist.add(10.0);
            bimodalHist.add(20.0);
        }
        Assert.assertEquals(bimodalHist.median(), 15.0, EPSILON, "");
    }

    @Test
    public void testMedianOfOdds() {
        // 21 items total; the 11th-smallest value is 20.0.
        final Histogram bimodalHist = new Histogram();
        for (int i = 0; i < 10; i++) {
            bimodalHist.add(10.0);
            bimodalHist.add(20.0);
        }
        bimodalHist.add(20.0);
        Assert.assertEquals(bimodalHist.median(), 20.0, EPSILON, "");
    }

    @Test
    public void testMedianOfEmptyHist() {
        // An empty histogram has no median; the documented contract is to return null.
        final Histogram empty = new Histogram();
        Assert.assertNull(empty.median());
    }

    @Test
    public void testMedianOfSingleItem() {
        final Histogram singleItem = new Histogram();
        singleItem.add(20.0);
        Assert.assertEquals(singleItem.median(), 20.0, EPSILON, "");
    }
}