Merge pull request #1541 from broadinstitute/ldg_revertToMedianRankSum

Changed calculation of allele-specific rank sum so it's now the median of the per-sample values
This commit is contained in:
ldgauthier 2017-05-10 14:43:08 -04:00 committed by GitHub
commit ae174d3359
7 changed files with 431 additions and 82 deletions

View File

@ -60,6 +60,7 @@ import org.broadinstitute.gatk.engine.GenomeAnalysisEngine;
import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.AnnotatorCompatible;
import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.ReducibleAnnotation;
import org.broadinstitute.gatk.utils.MannWhitneyU;
import org.broadinstitute.gatk.utils.MathUtils;
import org.broadinstitute.gatk.utils.contexts.AlignmentContext;
import org.broadinstitute.gatk.utils.contexts.ReferenceContext;
import org.broadinstitute.gatk.utils.exceptions.GATKException;
@ -78,10 +79,11 @@ import java.util.*;
*/
public abstract class AS_RankSumTest extends RankSumTest implements ReducibleAnnotation {
private final static Logger logger = Logger.getLogger(AS_RMSAnnotation.class);
protected final String splitDelim = "\\|"; //String.split takes a regex, so we need to escape the pipe
protected final String printDelim = "|";
protected final String reducedDelim = ",";
protected AnnotatorCompatible callingWalker;
private final String splitDelim = "\\|"; //String.split takes a regex, so we need to escape the pipe
private final String printDelim = "|";
private final String reducedDelim = ","; //delimiter for list of reduced values
private final String rawDelim = ","; //delimiter for list of raw values
private AnnotatorCompatible callingWalker;
@Override
public void initialize(final AnnotatorCompatible walker, final GenomeAnalysisEngine toolkit, final Set<VCFHeaderLine> headerLines) {
@ -109,20 +111,21 @@ public abstract class AS_RankSumTest extends RankSumTest implements ReducibleAnn
return new HashMap<>();
final Map<String, Object> annotations = new HashMap<>();
final AlleleSpecificAnnotationData<CompressedDataList<Integer>> myData = initializeNewAnnotationData(vc.getAlleles());
calculateRawData(vc, perReadAlleleLikelihoodMap, myData);
final String annotationString = makeRawAnnotationString(vc.getAlleles(), myData.getAttributeMap());
final AlleleSpecificAnnotationData<CompressedDataList<Integer>> myRawData = initializeNewRawAnnotationData(vc.getAlleles());
calculateRawData(vc, perReadAlleleLikelihoodMap, myRawData);
Map<Allele, List<Double>> myRankSumStats = calculateRankSum(myRawData.getAttributeMap(), myRawData.getRefAllele());
final String annotationString = makeRawAnnotationString(vc.getAlleles(), myRankSumStats);
annotations.put(getRawKeyName(), annotationString);
return annotations;
}
protected void parseRawDataString(final ReducibleAnnotationData<CompressedDataList<Integer>> myData) {
protected void parseRawDataString(final ReducibleAnnotationData<Histogram> myData) {
final String rawDataString = myData.getRawData();
String rawDataNoBrackets;
final Map<Allele, CompressedDataList<Integer>> perAlleleValues = new HashMap<>();
final Map<Allele, Histogram> perAlleleValues = new HashMap<>();
//Initialize maps
for (final Allele current : myData.getAlleles()) {
perAlleleValues.put(current, new CompressedDataList<Integer>());
perAlleleValues.put(current, new Histogram());
}
//Map gives back list with []
if (rawDataString.charAt(0) == '[') {
@ -131,40 +134,61 @@ public abstract class AS_RankSumTest extends RankSumTest implements ReducibleAnn
else {
rawDataNoBrackets = rawDataString;
}
//rawDataPerAllele is the list of values for each allele (each of variable length)
//rawDataPerAllele is a per-sample list of the rank sum statistic for each allele
final String[] rawDataPerAllele = rawDataNoBrackets.split(splitDelim);
for (int i=0; i<rawDataPerAllele.length; i++) {
final String alleleData = rawDataPerAllele[i];
if (alleleData.isEmpty())
continue;
final CompressedDataList<Integer> alleleList = perAlleleValues.get(myData.getAlleles().get(i));
final Histogram alleleList = perAlleleValues.get(myData.getAlleles().get(i));
final String[] rawListEntriesAsStringVector = alleleData.split(",");
if (rawListEntriesAsStringVector.length %2 != 0)
throw new GATKException("ERROR: rank sum test raw annotation data must occur in <value,count> pairs");
for (int j=0; j<rawListEntriesAsStringVector.length; j+=2) {
int value, count;
for (int j=0; j<rawListEntriesAsStringVector.length; j++) {
Double value;
if (!rawListEntriesAsStringVector[j].isEmpty()) {
value = Integer.parseInt(rawListEntriesAsStringVector[j].trim());
value = Double.parseDouble(rawListEntriesAsStringVector[j].trim());
if(!value.isNaN())
alleleList.add(value);
}
}
}
myData.setAttributeMap(perAlleleValues);
myData.validateAllelesList();
}
protected void parseCombinedDataString(final ReducibleAnnotationData<Histogram> myData) {
final String rawDataString = myData.getRawData();
String rawDataNoBrackets;
final Map<Allele, Histogram> perAlleleValues = new HashMap<>();
//Initialize maps
for (final Allele current : myData.getAlleles()) {
perAlleleValues.put(current, new Histogram());
}
//Map gives back list with []
if (rawDataString.charAt(0) == '[') {
rawDataNoBrackets = rawDataString.substring(1, rawDataString.length() - 1);
}
else {
rawDataNoBrackets = rawDataString;
}
//rawDataPerAllele is a string representation of the Histogram for each allele in the form value, count, value, count...
final String[] rawDataPerAllele = rawDataNoBrackets.split(splitDelim);
for (int i=0; i<rawDataPerAllele.length; i++) {
final String alleleData = rawDataPerAllele[i];
final Histogram alleleList = perAlleleValues.get(myData.getAlleles().get(i));
final String[] rawListEntriesAsStringVector = alleleData.split(rawDelim);
for (int j=0; j<rawListEntriesAsStringVector.length; j+=2) {
Double value;
int count;
if (!rawListEntriesAsStringVector[j].isEmpty()) {
value = Double.parseDouble(rawListEntriesAsStringVector[j].trim());
if (!rawListEntriesAsStringVector[j + 1].isEmpty()) {
count = Integer.parseInt(rawListEntriesAsStringVector[j + 1].trim());
alleleList.add(value,count);
if(!value.isNaN())
alleleList.add(value,count);
}
}
}
}
myData.setAttributeMap(perAlleleValues);
//check the alleles list
boolean foundRef = false;
for (final Allele a : myData.getAlleles()) {
if (a.isReference()) {
if (foundRef)
throw new GATKException("ERROR: multiple reference alleles found in annotation data\n");
foundRef = true;
}
}
if (!foundRef)
throw new GATKException("ERROR: no reference alleles found in annotation data\n");
myData.validateAllelesList();
}
@Override
@ -178,43 +202,76 @@ public abstract class AS_RankSumTest extends RankSumTest implements ReducibleAnn
}
final Map<String, Object> annotations = new HashMap<>();
final String annotationString = makeRawAnnotationString(vcAlleles, combinedData.getAttributeMap());
final String annotationString = makeCombinedAnnotationString(vcAlleles, combinedData.getAttributeMap());
annotations.put(getRawKeyName(), annotationString);
return annotations;
}
protected AlleleSpecificAnnotationData initializeNewAnnotationData(final List<Allele> vcAlleles) {
protected AlleleSpecificAnnotationData initializeNewRawAnnotationData(final List<Allele> vcAlleles) {
Map<Allele, CompressedDataList<Integer>> perAlleleValues = new HashMap<>();
for (Allele a : vcAlleles) {
perAlleleValues.put(a, new CompressedDataList<Integer>());
}
AlleleSpecificAnnotationData ret = new AlleleSpecificAnnotationData(vcAlleles, perAlleleValues.toString());
final AlleleSpecificAnnotationData ret = new AlleleSpecificAnnotationData(vcAlleles, perAlleleValues.toString());
ret.setAttributeMap(perAlleleValues);
return ret;
}
protected void combineAttributeMap(final ReducibleAnnotationData<CompressedDataList<Integer>> toAdd, final ReducibleAnnotationData<CompressedDataList<Integer>> combined) {
/**
 * Creates a fresh per-allele annotation container backed by an empty {@link Histogram}
 * for every allele in the variant context.
 *
 * @param vcAlleles alleles (ref and alts) to initialize entries for
 * @return annotation data whose attribute map holds one empty Histogram per allele
 */
protected AlleleSpecificAnnotationData initializeNewAnnotationData(final List<Allele> vcAlleles) {
    final Map<Allele, Histogram> histogramsByAllele = new HashMap<>();
    for (final Allele allele : vcAlleles) {
        histogramsByAllele.put(allele, new Histogram());
    }
    // The raw-data string is simply the map's toString() rendering of the (empty) histograms.
    final AlleleSpecificAnnotationData annotationData =
            new AlleleSpecificAnnotationData(vcAlleles, histogramsByAllele.toString());
    annotationData.setAttributeMap(histogramsByAllele);
    return annotationData;
}
/**
 * Merges the per-allele histograms from {@code toAdd} into {@code combined}.
 * Only alleles already present in {@code combined} are considered; alleles missing
 * from {@code toAdd}, or mapped to null there, are left untouched.
 *
 * @param toAdd    annotation data contributing additional histogram counts
 * @param combined accumulator annotation data, updated in place
 */
protected void combineAttributeMap(final ReducibleAnnotationData<Histogram> toAdd, final ReducibleAnnotationData<Histogram> combined) {
    for (final Allele allele : combined.getAlleles()) {
        if (!toAdd.hasAttribute(allele)) {
            continue;
        }
        final Histogram incoming = toAdd.getAttribute(allele);
        if (incoming == null) {
            continue;
        }
        // Histogram.add throws if the bin sizes of the two histograms differ.
        final Histogram accumulated = combined.getAttribute(allele);
        accumulated.add(incoming);
        combined.putAttribute(allele, accumulated);
    }
}
protected String makeRawAnnotationString(final List<Allele> vcAlleles, final Map<Allele, CompressedDataList<Integer>> perAlleleValues) {
protected String makeRawAnnotationString(final List<Allele> vcAlleles, final Map<Allele, List<Double>> perAlleleValues) {
String annotationString = "";
for (int i =0; i< vcAlleles.size(); i++) {
if (i!=0)
annotationString += printDelim;
CompressedDataList<Integer> alleleValues = perAlleleValues.get(vcAlleles.get(i));
annotationString += alleleValues.toString();
for (int i = 0; i< vcAlleles.size(); i++) {
if (vcAlleles.get(i).isReference())
continue;
if (i != 0)
annotationString += printDelim;
final List<Double> alleleValue = perAlleleValues.get(vcAlleles.get(i));
//can be null if there are no ref reads
if (alleleValue == null)
continue;
annotationString += formatListAsString(alleleValue);
}
return annotationString;
}
protected String makeReducedAnnotationString(VariantContext vc, Map<Allele,Double> perAltRankSumResults) {
/**
 * Renders the combined per-allele histograms as a delimited annotation string.
 * Reference alleles are skipped entirely (no value, no delimiter); every non-first
 * allele position contributes a {@code printDelim} so downstream splitting can
 * recover allele positions. A null histogram (possible when there are no ref reads)
 * still contributes its delimiter but no value.
 *
 * @param vcAlleles       alleles in variant-context order
 * @param perAlleleValues combined histogram for each allele; entries may be null
 * @return the raw annotation string, e.g. {@code |1.0,5,2.0,3}
 */
protected String makeCombinedAnnotationString(final List<Allele> vcAlleles, final Map<Allele, Histogram> perAlleleValues) {
    final StringBuilder annotation = new StringBuilder();
    for (int idx = 0; idx < vcAlleles.size(); idx++) {
        if (vcAlleles.get(idx).isReference()) {
            continue;
        }
        if (idx != 0) {
            annotation.append(printDelim);
        }
        final Histogram alleleHistogram = perAlleleValues.get(vcAlleles.get(idx));
        // can be null if there are no ref reads
        if (alleleHistogram == null) {
            continue;
        }
        annotation.append(alleleHistogram.toString());
    }
    return annotation.toString();
}
private String makeReducedAnnotationString(final VariantContext vc, final Map<Allele,Double> perAltRankSumResults) {
String annotationString = "";
for (final Allele a : vc.getAlternateAlleles()) {
if (!annotationString.isEmpty())
@ -242,8 +299,8 @@ public abstract class AS_RankSumTest extends RankSumTest implements ReducibleAnn
return new HashMap<>();
final Map<String,Object> annotations = new HashMap<>();
final AlleleSpecificAnnotationData<CompressedDataList<Integer>> myData = new AlleleSpecificAnnotationData(originalVC.getAlleles(), rawRankSumData);
parseRawDataString(myData);
final AlleleSpecificAnnotationData<Histogram> myData = new AlleleSpecificAnnotationData(originalVC.getAlleles(), rawRankSumData);
parseCombinedDataString(myData);
final Map<Allele, Double> perAltRankSumResults = calculateReducedData(myData.getAttributeMap(), myData.getRefAllele());
//shortcut for no ref values
@ -254,27 +311,37 @@ public abstract class AS_RankSumTest extends RankSumTest implements ReducibleAnn
return annotations;
}
public void calculateRawData(VariantContext vc, Map<String, PerReadAlleleLikelihoodMap> pralm, ReducibleAnnotationData myData) {
public void calculateRawData(final VariantContext vc, final Map<String, PerReadAlleleLikelihoodMap> pralm, final ReducibleAnnotationData myData) {
if( vc.getGenotypes().getSampleNames().size() != 1)
throw new IllegalStateException("Calculating raw data for allele-specific rank sums requires variant context input with exactly one sample, as in a gVCF.");
if(pralm == null)
return;
final Map<Allele, CompressedDataList<Integer>> perAlleleValues = myData.getAttributeMap();
for ( final PerReadAlleleLikelihoodMap likelihoodMap : pralm.values() ) {
if ( likelihoodMap != null && !likelihoodMap.isEmpty() ) {
fillQualsFromLikelihoodMap(vc.getAlleles(), vc.getStart(), likelihoodMap, perAlleleValues);
fillQualsFromLikelihoodMap(vc.getGenotype(0).getAlleles(), vc.getStart(), likelihoodMap, perAlleleValues);
}
}
}
/**
*
* @param alleles the alleles that were called in the genotype for this sample
* @param refLoc
* @param likelihoodMap
* @param perAlleleValues
*/
private void fillQualsFromLikelihoodMap(final List<Allele> alleles,
final int refLoc,
final PerReadAlleleLikelihoodMap likelihoodMap,
final Map<Allele, CompressedDataList<Integer>> perAlleleValues) {
for ( final Map.Entry<GATKSAMRecord, Map<Allele,Double>> el : likelihoodMap.getLikelihoodReadMap().entrySet() ) {
final MostLikelyAllele a = PerReadAlleleLikelihoodMap.getMostLikelyAllele(el.getValue());
if ( ! a.isInformative() )
continue; // read is non-informative
if ( ! a.isInformative() || ! alleles.contains(a.getMostLikelyAllele()))
continue; // read is non-informative or supports an allele that was not called
final GATKSAMRecord read = el.getKey();
if ( isUsableRead(read, refLoc) ) {
@ -289,8 +356,8 @@ public abstract class AS_RankSumTest extends RankSumTest implements ReducibleAnn
}
}
public Map<Allele, Double> calculateReducedData(final Map<Allele, CompressedDataList<Integer>> perAlleleValues, final Allele ref) {
final Map<Allele, Double> perAltRankSumResults = new HashMap<>();
public Map<Allele, List<Double>> calculateRankSum(final Map<Allele, CompressedDataList<Integer>> perAlleleValues, final Allele ref) {
final Map<Allele, List<Double>> perAltRankSumResults = new HashMap<>();
//shortcut to not try to calculate rank sum if there are no reads that unambiguously support the ref
if (perAlleleValues.get(ref).isEmpty())
return perAltRankSumResults;
@ -299,12 +366,12 @@ public abstract class AS_RankSumTest extends RankSumTest implements ReducibleAnn
continue;
final MannWhitneyU mannWhitneyU = new MannWhitneyU();
//load alts (series 1)
List<Double> alts = new ArrayList<>();
final List<Double> alts = new ArrayList<>();
for (final Number qual : perAlleleValues.get(alt)) {
alts.add((double) qual.intValue());
}
//load refs (series 2)
List<Double> refs = new ArrayList<>();
final List<Double> refs = new ArrayList<>();
for (final Number qual : perAlleleValues.get(ref)) {
refs.add((double) qual.intValue());
}
@ -322,9 +389,31 @@ public abstract class AS_RankSumTest extends RankSumTest implements ReducibleAnn
}
// we are testing that set1 (the alt bases) have lower quality scores than set2 (the ref bases)
final MannWhitneyU.Result result = mannWhitneyU.test(convertToArray(alts), convertToArray(refs), MannWhitneyU.TestType.FIRST_DOMINATES);
perAltRankSumResults.put(alt, result.getZ());
perAltRankSumResults.put(alt, Collections.singletonList(result.getZ()));
}
return perAltRankSumResults;
}
/**
 * Reduces each alt allele's histogram of per-sample rank-sum values to its median.
 *
 * @param perAlleleValues histogram of rank-sum values per allele (ref included)
 * @param ref             the reference allele, which is excluded from the result
 * @return map from alt allele to median rank sum; the value may be null for an
 *         empty histogram (Histogram.median() returns null in that case)
 */
public Map<Allele, Double> calculateReducedData(final Map<Allele, Histogram> perAlleleValues, final Allele ref) {
    final Map<Allele, Double> medianByAlt = new HashMap<>();
    for (final Map.Entry<Allele, Histogram> entry : perAlleleValues.entrySet()) {
        final Allele allele = entry.getKey();
        if (allele.equals(ref, false)) {
            continue;   // only alts get a reduced value
        }
        final Histogram histogram = entry.getValue();
        if (histogram != null) {
            medianByAlt.put(allele, histogram.median());
        }
    }
    return medianByAlt;
}
/**
 * Formats a list of rank-sum values as a {@code reducedDelim}-separated string,
 * each value printed with three decimal places.
 *
 * @param rankSumValues values to format; NaNs are possible (e.g. homVar genotypes
 *                      with no ref reads) and are printed as-is — VQSR handles
 *                      them appropriately downstream
 * @return the delimited string, empty for an empty list
 */
public String formatListAsString(final List<Double> rankSumValues) {
    final StringBuilder formatted = new StringBuilder();
    for (int idx = 0; idx < rankSumValues.size(); idx++) {
        if (idx != 0) {
            formatted.append(reducedDelim);
        }
        formatted.append(String.format("%.3f", rankSumValues.get(idx)));
    }
    return formatted.toString();
}
}

View File

@ -333,7 +333,7 @@ public class HaplotypeCallerGVCFIntegrationTest extends WalkerTest {
public void testAlleleSpecificAnnotations() {
final String commandLine = String.format("-T HaplotypeCaller --pcr_indel_model NONE -pairHMMSub %s %s -R %s -I %s -L %s -ERC GVCF --no_cmdline_in_header -variant_index_type %s -variant_index_parameter %d -G Standard -G AS_Standard --disableDithering",
HMM_SUB_IMPLEMENTATION, ALWAYS_LOAD_VECTOR_HMM, b37KGReference, privateTestDir + "NA12878.HiSeq.b37.chr20.10_11mb.bam", "20:10433000-10437000", GATKVCFUtils.DEFAULT_GVCF_INDEX_TYPE, GATKVCFUtils.DEFAULT_GVCF_INDEX_PARAMETER);
final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Collections.singletonList("5877ccbc99bbaffbcd5fe3aaa3d7e7f7"));
final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Collections.singletonList("7e74286e2c412855509ea5312ea0ec57"));
spec.disableShadowBCF();
executeTest(" testAlleleSpecificAnnotations", spec);
}
@ -342,7 +342,7 @@ public class HaplotypeCallerGVCFIntegrationTest extends WalkerTest {
public void testASMQMateRankSumAnnotation() {
final String commandLine = String.format("-T HaplotypeCaller --pcr_indel_model NONE -pairHMMSub %s %s -R %s -I %s -L %s -ERC GVCF --no_cmdline_in_header -variant_index_type %s -variant_index_parameter %d -A AS_MQMateRankSumTest --disableDithering",
HMM_SUB_IMPLEMENTATION, ALWAYS_LOAD_VECTOR_HMM, b37KGReference, privateTestDir + "NA12878.HiSeq.b37.chr20.10_11mb.bam", "20:10433000-10437000", GATKVCFUtils.DEFAULT_GVCF_INDEX_TYPE, GATKVCFUtils.DEFAULT_GVCF_INDEX_PARAMETER);
final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Collections.singletonList("0381fec3b0d21508b28fa62c2a61ccfc"));
final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Collections.singletonList("e0b9135f68eb0b65cb36721dd3f40f00"));
spec.disableShadowBCF();
executeTest(" testASMQMateRankSumAnnotation", spec);
}
@ -351,7 +351,7 @@ public class HaplotypeCallerGVCFIntegrationTest extends WalkerTest {
public void testBetaTestingAnnotationGroup() {
final String commandLine = String.format("-T HaplotypeCaller --pcr_indel_model NONE -pairHMMSub %s %s -R %s -I %s -L %s -ERC GVCF --no_cmdline_in_header -variant_index_type %s -variant_index_parameter %d -G BetaTesting --disableDithering",
HMM_SUB_IMPLEMENTATION, ALWAYS_LOAD_VECTOR_HMM, b37KGReference, privateTestDir + "NA12878.HiSeq.b37.chr20.10_11mb.bam", "20:10433000-10437000", GATKVCFUtils.DEFAULT_GVCF_INDEX_TYPE, GATKVCFUtils.DEFAULT_GVCF_INDEX_PARAMETER);
final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Collections.singletonList("f248a6c4a7645dc5cc9f5ec9f81d9ad5"));
final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Collections.singletonList("81f1fcddfec262a8aaedb9ea37d89873"));
spec.disableShadowBCF();
executeTest(" testASMQMateRankSumAnnotation", spec);
}
@ -360,7 +360,7 @@ public class HaplotypeCallerGVCFIntegrationTest extends WalkerTest {
public void testASInsertSizeRankSum() {
final String commandLine = String.format("-T HaplotypeCaller --pcr_indel_model NONE -pairHMMSub %s %s -R %s -I %s -L %s -ERC GVCF --no_cmdline_in_header -variant_index_type %s -variant_index_parameter %d -G Standard -G AS_Standard --disableDithering -A AS_InsertSizeRankSum",
HMM_SUB_IMPLEMENTATION, ALWAYS_LOAD_VECTOR_HMM, b37KGReference, privateTestDir + "NA12878.HiSeq.b37.chr20.10_11mb.bam", "20:10433000-10437000", GATKVCFUtils.DEFAULT_GVCF_INDEX_TYPE, GATKVCFUtils.DEFAULT_GVCF_INDEX_PARAMETER);
final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Collections.singletonList("4599a591427c188c117f09ac40cc866f"));
final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Collections.singletonList("96934396683234559827a77c9bb38e21"));
spec.disableShadowBCF();
executeTest(" testASInsertSizeRankSum", spec);
}
@ -464,7 +464,7 @@ public class HaplotypeCallerGVCFIntegrationTest extends WalkerTest {
public void testHaplotypeCallerGVCSpanDel() {
final String commandLine = String.format("-T HaplotypeCaller -R %s -I %s -L 1:26357667 -ERC GVCF --no_cmdline_in_header -A AS_ReadPosRankSumTest -A ReadPosRankSumTest -variant_index_type %s -variant_index_parameter %d",
b37KGReference, privateTestDir + "NexPond-377866-1:26357600-26357700.bam", GATKVCFUtils.DEFAULT_GVCF_INDEX_TYPE, GATKVCFUtils.DEFAULT_GVCF_INDEX_PARAMETER);
final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Arrays.asList("bb12cf2dfa6f1fa0692395e295792584"));
final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Arrays.asList("db29a87113b358805e18ac8c20f9145d"));
spec.disableShadowBCF();
executeTest("testHaplotypeCallerGVCSpanDel", spec);
}

View File

@ -276,8 +276,8 @@ public class CombineGVCFsIntegrationTest extends WalkerTest {
@Test
public void testAlleleSpecificAnnotations() throws Exception {
final String cmd = "-T CombineGVCFs -R " + b37KGReference + " -o %s --no_cmdline_in_header -G Standard -G AS_Standard -V "
+ privateTestDir + "NA12878.AS.chr20snippet.g.vcf -V " + privateTestDir + "NA12891.AS.chr20snippet.g.vcf";
final WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, Arrays.asList("f6a7fa62c33de963c55262820effe44a"));
+ privateTestDir + "NA12878.AS.chr20snippet.g.vcf -V " + privateTestDir + "NA12892.AS.chr20snippet.g.vcf";
final WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, Arrays.asList("f053aaa50427ea3fa9a43240e56eefc1"));
spec.disableShadowBCF();
executeTest("testAlleleSpecificAnnotations", spec);
}
@ -285,17 +285,17 @@ public class CombineGVCFsIntegrationTest extends WalkerTest {
@Test
public void testASMateRankSumAnnotation() throws Exception {
final String cmd = "-T CombineGVCFs -R " + b37KGReference + " -o %s --no_cmdline_in_header -G Standard -G AS_Standard -A AS_MQMateRankSumTest -V "
+ privateTestDir + "NA12878.AS.MateRankSum.chr20snippet.g.vcf -V " + privateTestDir + "NA12891.AS.MateRankSum.chr20snippet.g.vcf";
final WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, Arrays.asList("2c264ed0057c93276c647f55998c4f25"));
+ privateTestDir + "NA12878.AS.MateRankSum.chr20snippet.g.vcf -V " + privateTestDir + "NA12892.AS.MateRankSum.chr20snippet.g.vcf";
final WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, Arrays.asList("951b59bf05eb4401da6a1bc8b2cdc261"));
spec.disableShadowBCF();
executeTest("testASMateRankSumAnnotation", spec);
}
@Test
public void testASInsertSizeRankSumAnnotation() throws Exception {
final String cmd = "-T CombineGVCFs -R " + b37KGReference + " -o %s --no_cmdline_in_header -G Standard -G AS_Standard -V "
+ privateTestDir + "NA12878.AS.InsertSizeRankSum.chr20snippet.g.vcf -V " + privateTestDir + "NA12891.AS.InsertSizeRankSum.chr20snippet.g.vcf";
final WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, Arrays.asList("ff5ca958e81e406cfe010d5649b5c0d1"));
final String cmd = "-T CombineGVCFs -R " + b37KGReference + " -o %s --no_cmdline_in_header -G Standard -G AS_Standard -A AS_InsertSizeRankSum -V "
+ privateTestDir + "NA12878.AS.InsertSizeRankSum.chr20snippet.g.vcf -V " + privateTestDir + "NA12892.AS.InsertSizeRankSum.chr20snippet.g.vcf";
final WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, Arrays.asList("9df5f327028768e4c5d20e5742d6d0ad"));
spec.disableShadowBCF();
executeTest("testASInsertSizeRankSumAnnotation", spec);
}

View File

@ -598,8 +598,8 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest {
@Test
public void testAlleleSpecificAnnotations() {
final String cmd = "-T GenotypeGVCFs -R " + b37KGReference + " -o %s --no_cmdline_in_header -G Standard -G AS_Standard --disableDithering -V "
+ privateTestDir + "NA12878.AS.chr20snippet.g.vcf -V " + privateTestDir + "NA12891.AS.chr20snippet.g.vcf";
final WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, Collections.singletonList("29d6db0a93abd72d64fb1e82da65c715"));
+ privateTestDir + "NA12878.AS.chr20snippet.g.vcf -V " + privateTestDir + "NA12892.AS.chr20snippet.g.vcf";
final WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, Collections.singletonList("f8ea6b63ea0bf7b0fdc266845762b927"));
spec.disableShadowBCF();
executeTest("testAlleleSpecificAnnotations", spec);
}
@ -607,17 +607,17 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest {
@Test
public void testASMateRankSumAnnotation() {
final String cmd = "-T GenotypeGVCFs -R " + b37KGReference + " -o %s --no_cmdline_in_header -G Standard -G AS_Standard -A AS_MQMateRankSumTest --disableDithering -V "
+ privateTestDir + "NA12878.AS.MateRankSum.chr20snippet.g.vcf -V " + privateTestDir + "NA12891.AS.MateRankSum.chr20snippet.g.vcf";
final WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, Collections.singletonList("2a330015a7db9f9aee9bc5b776698f73"));
+ privateTestDir + "NA12878.AS.MateRankSum.chr20snippet.g.vcf -V " + privateTestDir + "NA12892.AS.MateRankSum.chr20snippet.g.vcf";
final WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, Collections.singletonList("a3fc424534c31b43bdaaa4de3803f73b"));
spec.disableShadowBCF();
executeTest("testASMateRankSumAnnotation", spec);
}
@Test
public void testASInsertSizeRankSumAnnotation() {
final String cmd = "-T GenotypeGVCFs -R " + b37KGReference + " -o %s --no_cmdline_in_header -G Standard -G AS_Standard --disableDithering -V "
+ privateTestDir + "NA12878.AS.InsertSizeRankSum.chr20snippet.g.vcf -V " + privateTestDir + "NA12891.AS.InsertSizeRankSum.chr20snippet.g.vcf";
final WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, Collections.singletonList("75aee1e0c8c3528180e344ec6c0d8ffd"));
final String cmd = "-T GenotypeGVCFs -R " + b37KGReference + " -o %s --no_cmdline_in_header -G Standard -G AS_Standard -A AS_InsertSizeRankSum --disableDithering -V "
+ privateTestDir + "NA12878.AS.InsertSizeRankSum.chr20snippet.g.vcf -V " + privateTestDir + "NA12892.AS.InsertSizeRankSum.chr20snippet.g.vcf";
final WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, Collections.singletonList("55e14518c955de9158d98e55da75b643"));
spec.disableShadowBCF();
executeTest("testASInsertSizeRankSumAnnotation", spec);
}
@ -630,19 +630,19 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest {
public void testAlleleSpecificAnnotations_oneSample() {
final String cmd = "-T GenotypeGVCFs -R " + b37KGReference + " -o %s --no_cmdline_in_header -G Standard -G AS_Standard --disableDithering -V "
+ privateTestDir + "NA12878.AS.chr20snippet.g.vcf";
final WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, Collections.singletonList("f4fa3acec2b21037368898e913b7a3fa"));
final WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, Collections.singletonList("78c3ed48b17faebf77f4cc9f225b9045"));
spec.disableShadowBCF();
executeTest("testAlleleSpecificAnnotations_oneSample", spec);
}
@Test
//do at least 10 samples so InbreedingCoeff and AS_InbreedingCoeff are output
public void testAlleleSpecificAnnotations_elevenSamples() {
final String cmd = "-T GenotypeGVCFs -R " + b37KGReference + " -o %s --no_cmdline_in_header -G Standard -G AS_Standard --disableDithering -V "
public void testAlleleSpecificAnnotations_moreSamples() {
final String cmd = "-T GenotypeGVCFs -R " + b37KGReference + " -o %s --no_cmdline_in_header -G Standard -G AS_Standard -A AS_InsertSizeRankSum -A AS_MQMateRankSumTest --disableDithering -V "
+ privateTestDir + "multiSamples.g.vcf";
final WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, Collections.singletonList("4e90f6908248fac9b3ce3e545180a8e5"));
final WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, Collections.singletonList("5b007393f598cde9f18b03d4ac5cab25"));
spec.disableShadowBCF();
executeTest("testAlleleSpecificAnnotations_elevenSamples", spec);
executeTest("testAlleleSpecificAnnotations_moreSamples", spec);
}
@Test
@ -658,7 +658,7 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest {
public void testFractionInformativeReads() {
final String cmd = "-T GenotypeGVCFs -R " + b37KGReference + " -G AS_Standard -o %s --no_cmdline_in_header -A FractionInformativeReads --disableDithering -V "
+ privateTestDir + "NA12878.AS.chr20snippet.g.vcf -V " + privateTestDir + "NA12891.AS.chr20snippet.g.vcf";
final WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, Collections.singletonList("0b1bbcc7d24f8b0945c97907b1cdd974"));
final WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, Collections.singletonList("e5db4edf2d2b2abdb0b07a57d658785b"));
spec.disableShadowBCF();
executeTest("testAlleleSpecificAnnotations", spec);
}

View File

@ -0,0 +1,143 @@
/*
* Copyright 2012-2016 Broad Institute, Inc.
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.gatk.tools.walkers.annotator;
import org.broadinstitute.gatk.utils.exceptions.GATKException;
import java.util.Arrays;
/**
* Created by gauthier on 11/1/16.
*/
/**
 * A fixed-bin-width histogram over doubles, stored sparsely as {@code <bin, count>}
 * pairs inside a {@link CompressedDataList}. Values are binned by flooring
 * {@code value / binSize} (with a small epsilon so values exactly on a bin boundary
 * stay in that bin). Used to accumulate per-sample rank-sum statistics so the
 * combined annotation can be reduced to a median.
 *
 * Created by gauthier on 11/1/16.
 */
public class Histogram {
    private final Double binSize;
    private final String precisionFormat;   // printf format matching the bin precision for toString()
    private static final String PRINT_DELIM = ",";
    private static final double BIN_EPSILON = 0.01;
    private final CompressedDataList<Integer> dataList = new CompressedDataList<>();

    /** Default histogram with bin size 0.1. */
    public Histogram() {
        this.binSize = 0.1;
        this.precisionFormat = "%.1f";
    }

    /**
     * @param binSize width of each bin; precision of printed bin labels is derived from it
     *                (exact only for power-of-ten bin sizes, e.g. 0.1 -> "%.1f")
     */
    public Histogram(final Double binSize) {
        this.binSize = binSize;
        this.precisionFormat = "%." + Math.round(-Math.log10(binSize)) + "f";
    }

    /**
     * Adds a single value to the histogram.
     *
     * @throws GATKException if the value bins outside the representable (int) key range
     */
    public void add(final Double d) {
        final long binKey = getBinnedValue(d);
        if (isValidBinKey(binKey)) {
            dataList.add((int) binKey);
        } else {
            throw new GATKException("Histogram values are suspiciously extreme. Failed to add " + d + " to the Histogram.");
        }
    }

    /**
     * Adds {@code count} occurrences of a value.
     *
     * @throws GATKException if {@code count < 1} or the value bins out of range
     */
    public void add(final Double d, final int count) {
        if (count < 1) {
            throw new GATKException("Cannot add non-positive counts to Histogram.");
        }
        final long binKey = getBinnedValue(d);
        if (isValidBinKey(binKey)) {
            dataList.add((int) binKey, count);
        } else {
            throw new GATKException("Histogram values are suspiciously extreme. Failed to add " + d + " to the Histogram.");
        }
    }

    /**
     * Merges another histogram's counts into this one.
     *
     * @throws GATKException if the two histograms have different bin sizes
     */
    public void add(final Histogram h) {
        if (!this.binSize.equals(h.binSize)) {
            throw new GATKException("Histogram bin sizes are mismatched -- cannot add bin size " + this.binSize + " to " + h.binSize);
        }
        this.dataList.add(h.dataList);
    }

    /**
     * @return the count stored in the bin containing {@code d}; may be null if that
     *         bin has never been populated
     * @throws GATKException if the value bins outside the representable range
     */
    public Integer get(final Double d) {
        final long binKey = getBinnedValue(d);
        if (isValidBinKey(binKey)) {
            return dataList.getValueCounts().get((int) binKey);
        } else {
            throw new GATKException("Requested value is suspiciously extreme. Failed to retrieve " + d + " from the Histogram.");
        }
    }

    /**
     * Computes the median of the binned values (each bin contributes its lower-edge
     * value, {@code key * binSize}). For an even item count whose two middle items
     * fall in different bins, returns the mean of the two bin values.
     *
     * @return may be null if Histogram is empty
     */
    public Double median() {
        int numItems = 0;
        for (final int count : dataList.valueCounts.values()) {
            numItems += count;
        }
        final boolean oddNumberValues = numItems % 2 != 0;
        final int medianIndex = (numItems + 1) / 2;

        // BUGFIX: valueCounts is not guaranteed to iterate in key order (toString()
        // sorts its keys explicitly, implying a hash-backed map). Walking cumulative
        // counts over unsorted bins would yield an arbitrary "median", so sort first.
        // TODO(review): confirm CompressedDataList's backing map is indeed unordered.
        final Integer[] sortedBins = dataList.valueCounts.keySet().toArray(new Integer[0]);
        Arrays.sort(sortedBins);

        int counter = 0;
        Double firstMedian = null;  // lower of the two middle bins, for the even case
        for (final Integer key : sortedBins) {
            counter += dataList.valueCounts.get(key);
            if (counter > medianIndex) {
                if (firstMedian == null) {
                    return key * binSize;               // both middle items in this bin
                }
                return (firstMedian + key) / 2.0 * binSize; // middle items straddle two bins
            }
            if (counter == medianIndex) {
                if (oddNumberValues) {
                    return key * binSize;
                }
                firstMedian = (double) key;             // even count: remember lower middle bin
            }
        }
        return null;    // empty histogram
    }

    private long getBinnedValue(final double d) {
        // add a little epsilon before division so values exactly on bin boundaries will stay in the same bin
        return Math.round(Math.floor((d + BIN_EPSILON * binSize) / binSize));
    }

    private boolean isValidBinKey(final long binnedValue) {
        return binnedValue <= Integer.MAX_VALUE && binnedValue >= Integer.MIN_VALUE;
    }

    /**
     * Renders the histogram as {@code value,count,value,count,...} with bins in
     * ascending order and bin values printed at the bin-size precision.
     */
    @Override
    public String toString() {
        final StringBuilder str = new StringBuilder();
        final Object[] keys = dataList.valueCounts.keySet().toArray();
        Arrays.sort(keys);
        for (final Object key : keys) {
            if (str.length() > 0) {
                str.append(PRINT_DELIM);
            }
            // bin key needs to be output with the specified precision
            str.append(String.format(precisionFormat, (double) (int) key * binSize))
               .append(PRINT_DELIM)
               .append(dataList.valueCounts.get(key));
        }
        return str.toString();
    }
}

View File

@ -26,6 +26,7 @@
package org.broadinstitute.gatk.tools.walkers.annotator;
import htsjdk.variant.variantcontext.Allele;
import org.broadinstitute.gatk.utils.exceptions.GATKException;
import java.util.*;
@ -102,4 +103,17 @@ public class ReducibleAnnotationData<T> {
*/
public Map<Allele, T> getAttributeMap() {return Collections.unmodifiableMap(attributeMap);}
public void validateAllelesList() {
boolean foundRef = false;
for (final Allele a : this.getAlleles()) {
if (a.isReference()) {
if (foundRef)
throw new GATKException("ERROR: multiple reference alleles found in annotation data\n");
foundRef = true;
}
}
if (!foundRef)
throw new GATKException("ERROR: no reference alleles found in annotation data\n");
}
}

View File

@ -0,0 +1,103 @@
/*
* Copyright 2012-2016 Broad Institute, Inc.
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.gatk.tools.walkers.annotator;
import org.testng.Assert;
import org.testng.annotations.Test;
/**
* Created by gauthier on 11/1/16.
*/
/**
 * Unit tests for {@link Histogram}: binning of added values, bulk adds, and the
 * median over odd, even, single-item, and empty histograms.
 *
 * Created by gauthier on 11/1/16.
 */
public class HistogramUnitTest {
    private final Double EPSILON = 0.001;

    @Test
    public void testAdd() throws Exception {
        final Histogram hist = new Histogram();
        // 101 values spanning 1.000..1.100 at the default 0.1 bin size:
        // 1.000..1.099 (100 values) land in the 1.0 bin, 1.100 lands in the 1.1 bin.
        for (int i = 0; i <= 100; i++) {
            hist.add(1 + i / 1000.0);
        }
        Assert.assertEquals(hist.get(1.0), Integer.valueOf(100), "");
        Assert.assertEquals(hist.get(1.1), Integer.valueOf(1), "");
    }

    @Test
    public void testAddingQuantizedValues() throws Exception {
        final Histogram hist = new Histogram();
        // Repeatedly adding a value on a bin boundary must keep it in one bin.
        for (int i = 0; i < 100; i++) {
            hist.add(1.2);
        }
        Assert.assertEquals(hist.get(1.2), Integer.valueOf(100));
        Assert.assertEquals(hist.median(), 1.2, EPSILON);
    }

    @Test
    public void testBulkAdd() throws Exception {
        final Histogram hist = new Histogram();
        // Same spread as testAdd but with count 2 per value.
        for (int i = 0; i <= 100; i++) {
            hist.add(1 + i / 1000.0, 2);
        }
        Assert.assertEquals(hist.get(1.0), Integer.valueOf(200), "");
        Assert.assertEquals(hist.get(1.1), Integer.valueOf(2), "");
    }

    @Test
    public void testMedianOfEvens() throws Exception {
        final Histogram hist = new Histogram();
        // 10 items in each of two bins -> median is the mean of the two bin values.
        for (int i = 0; i < 10; i++) {
            hist.add(10.0);
            hist.add(20.0);
        }
        Assert.assertEquals(hist.median(), 15.0, EPSILON, "");
    }

    @Test
    public void testMedianOfOdds() throws Exception {
        final Histogram hist = new Histogram();
        for (int i = 0; i < 10; i++) {
            hist.add(10.0);
            hist.add(20.0);
        }
        // 21st item tips the middle into the 20.0 bin.
        hist.add(20.0);
        Assert.assertEquals(hist.median(), 20.0, EPSILON, "");
    }

    @Test
    public void testMedianOfEmptyHist() throws Exception {
        final Histogram empty = new Histogram();
        Assert.assertNull(empty.median());
    }

    @Test
    public void testMedianOfSingleItem() throws Exception {
        final Histogram singleItem = new Histogram();
        singleItem.add(20.0);
        Assert.assertEquals(singleItem.median(), 20.0, EPSILON, "");
    }
}