Merge pull request #1610 from broadinstitute/ldg_fixRankSumHistogramOutput2

Changed ASRankSum single-sample output so it's combine-able
This commit is contained in:
ldgauthier 2017-07-18 12:51:24 -04:00 committed by GitHub
commit 97ed131abd
5 changed files with 48 additions and 66 deletions

View File

@ -113,47 +113,13 @@ public abstract class AS_RankSumTest extends RankSumTest implements ReducibleAnn
final Map<String, Object> annotations = new HashMap<>();
final AlleleSpecificAnnotationData<CompressedDataList<Integer>> myRawData = initializeNewRawAnnotationData(vc.getAlleles());
calculateRawData(vc, perReadAlleleLikelihoodMap, myRawData);
Map<Allele, List<Double>> myRankSumStats = calculateRankSum(myRawData.getAttributeMap(), myRawData.getRefAllele());
Map<Allele, Double> myRankSumStats = calculateRankSum(myRawData.getAttributeMap(), myRawData.getRefAllele());
final String annotationString = makeRawAnnotationString(vc.getAlleles(), myRankSumStats);
annotations.put(getRawKeyName(), annotationString);
return annotations;
}
protected void parseRawDataString(final ReducibleAnnotationData<Histogram> myData) {
final String rawDataString = myData.getRawData();
String rawDataNoBrackets;
final Map<Allele, Histogram> perAlleleValues = new HashMap<>();
//Initialize maps
for (final Allele current : myData.getAlleles()) {
perAlleleValues.put(current, new Histogram());
}
//Map gives back list with []
if (rawDataString.charAt(0) == '[') {
rawDataNoBrackets = rawDataString.substring(1, rawDataString.length() - 1);
}
else {
rawDataNoBrackets = rawDataString;
}
//rawDataPerAllele is a per-sample list of the rank sum statistic for each allele
final String[] rawDataPerAllele = rawDataNoBrackets.split(splitDelim);
for (int i=0; i<rawDataPerAllele.length; i++) {
final String alleleData = rawDataPerAllele[i];
final Histogram alleleList = perAlleleValues.get(myData.getAlleles().get(i));
final String[] rawListEntriesAsStringVector = alleleData.split(",");
for (int j=0; j<rawListEntriesAsStringVector.length; j++) {
Double value;
if (!rawListEntriesAsStringVector[j].isEmpty()) {
value = Double.parseDouble(rawListEntriesAsStringVector[j].trim());
if(!value.isNaN())
alleleList.add(value);
}
}
}
myData.setAttributeMap(perAlleleValues);
myData.validateAllelesList();
}
protected void parseCombinedDataString(final ReducibleAnnotationData<Histogram> myData) {
final String rawDataString = myData.getRawData();
String rawDataNoBrackets;
final Map<Allele, Histogram> perAlleleValues = new HashMap<>();
@ -179,9 +145,10 @@ public abstract class AS_RankSumTest extends RankSumTest implements ReducibleAnn
int count;
if (!rawListEntriesAsStringVector[j].isEmpty()) {
value = Double.parseDouble(rawListEntriesAsStringVector[j].trim());
if(value.isNaN())
continue;
if (!rawListEntriesAsStringVector[j + 1].isEmpty()) {
count = Integer.parseInt(rawListEntriesAsStringVector[j + 1].trim());
if(!value.isNaN())
alleleList.add(value,count);
}
}
@ -239,18 +206,18 @@ public abstract class AS_RankSumTest extends RankSumTest implements ReducibleAnn
}
}
protected String makeRawAnnotationString(final List<Allele> vcAlleles, final Map<Allele, List<Double>> perAlleleValues) {
protected String makeRawAnnotationString(final List<Allele> vcAlleles, final Map<Allele, Double> perAlleleValues) {
String annotationString = "";
for (int i = 0; i< vcAlleles.size(); i++) {
if (vcAlleles.get(i).isReference())
continue;
if (i != 0)
if (i != 0) //strings will always start with a printDelim because we won't have values for the reference allele, but keep this for consistency with other annotations
annotationString += printDelim;
final List<Double> alleleValue = perAlleleValues.get(vcAlleles.get(i));
final Double alleleValue = perAlleleValues.get(vcAlleles.get(i));
//can be null if there are no ref reads
if (alleleValue == null)
continue;
annotationString += formatListAsString(alleleValue);
annotationString += outputSingletonValueAsHistogram(alleleValue);
}
return annotationString;
}
@ -260,11 +227,11 @@ public abstract class AS_RankSumTest extends RankSumTest implements ReducibleAnn
for (int i = 0; i< vcAlleles.size(); i++) {
if (vcAlleles.get(i).isReference())
continue;
if (i != 0)
if (i != 0) //strings will always start with a printDelim because we won't have values for the reference allele, but keep this for consistency with other annotations
annotationString += printDelim;
final Histogram alleleValue = perAlleleValues.get(vcAlleles.get(i));
//can be null if there are no ref reads
if (alleleValue == null)
//can be empty if there are no ref reads
if (alleleValue.isEmpty())
continue;
annotationString += alleleValue.toString();
}
@ -300,7 +267,7 @@ public abstract class AS_RankSumTest extends RankSumTest implements ReducibleAnn
final Map<String,Object> annotations = new HashMap<>();
final AlleleSpecificAnnotationData<Histogram> myData = new AlleleSpecificAnnotationData(originalVC.getAlleles(), rawRankSumData);
parseCombinedDataString(myData);
parseRawDataString(myData);
final Map<Allele, Double> perAltRankSumResults = calculateReducedData(myData.getAttributeMap(), myData.getRefAllele());
//shortcut for no ref values
@ -356,8 +323,8 @@ public abstract class AS_RankSumTest extends RankSumTest implements ReducibleAnn
}
}
public Map<Allele, List<Double>> calculateRankSum(final Map<Allele, CompressedDataList<Integer>> perAlleleValues, final Allele ref) {
final Map<Allele, List<Double>> perAltRankSumResults = new HashMap<>();
public Map<Allele, Double> calculateRankSum(final Map<Allele, CompressedDataList<Integer>> perAlleleValues, final Allele ref) {
final Map<Allele, Double> perAltRankSumResults = new HashMap<>();
//shortcut to not try to calculate rank sum if there are no reads that unambiguously support the ref
if (perAlleleValues.get(ref).isEmpty())
return perAltRankSumResults;
@ -389,7 +356,7 @@ public abstract class AS_RankSumTest extends RankSumTest implements ReducibleAnn
}
// we are testing that set1 (the alt bases) have lower quality scores than set2 (the ref bases)
final MannWhitneyU.Result result = mannWhitneyU.test(convertToArray(alts), convertToArray(refs), MannWhitneyU.TestType.FIRST_DOMINATES);
perAltRankSumResults.put(alt, Collections.singletonList(result.getZ()));
perAltRankSumResults.put(alt, result.getZ());
}
return perAltRankSumResults;
}
@ -410,10 +377,17 @@ public abstract class AS_RankSumTest extends RankSumTest implements ReducibleAnn
for (int i=0; i<rankSumValues.size(); i++) {
if(i!=0)
formattedString += reducedDelim;
//we can get NaNs if one of the ref or alt lists is empty (e.g. homVar genotypes), but VQSR will process them appropriately downstream
//we can get NaNs if one of the ref or alt lists is empty (e.g. homVar genotypes)
if (!rankSumValues.get(i).isNaN())
formattedString += String.format("%.3f", rankSumValues.get(i));
}
return formattedString;
}
public String outputSingletonValueAsHistogram(final Double rankSumValue) {
Histogram h = new Histogram();
h.add(rankSumValue);
return h.toString();
}
}

View File

@ -330,7 +330,7 @@ public class HaplotypeCallerGVCFIntegrationTest extends WalkerTest {
public void testAlleleSpecificAnnotations() {
final String commandLine = String.format("-T HaplotypeCaller --pcr_indel_model NONE -R %s -I %s -L %s -ERC GVCF --no_cmdline_in_header -variant_index_type %s -variant_index_parameter %d -G Standard -G AS_Standard --disableDithering",
b37KGReference, privateTestDir + "NA12878.HiSeq.b37.chr20.10_11mb.bam", "20:10433000-10437000", GATKVCFUtils.DEFAULT_GVCF_INDEX_TYPE, GATKVCFUtils.DEFAULT_GVCF_INDEX_PARAMETER);
final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Collections.singletonList("d2031195a42cf857e4798090c15f9712"));
final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Collections.singletonList("45347695ff437a9615163e9f3bcf8320"));
spec.disableShadowBCF();
executeTest(" testAlleleSpecificAnnotations", spec);
}
@ -339,7 +339,7 @@ public class HaplotypeCallerGVCFIntegrationTest extends WalkerTest {
public void testASMQMateRankSumAnnotation() {
final String commandLine = String.format("-T HaplotypeCaller --pcr_indel_model NONE -R %s -I %s -L %s -ERC GVCF --no_cmdline_in_header -variant_index_type %s -variant_index_parameter %d -A AS_MQMateRankSumTest --disableDithering",
b37KGReference, privateTestDir + "NA12878.HiSeq.b37.chr20.10_11mb.bam", "20:10433000-10437000", GATKVCFUtils.DEFAULT_GVCF_INDEX_TYPE, GATKVCFUtils.DEFAULT_GVCF_INDEX_PARAMETER);
final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Collections.singletonList("5ffa2b108046274aaf3180ea10bda8f5"));
final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Collections.singletonList("5ca7ed6a7ff152b04bb2a7a97607091a"));
spec.disableShadowBCF();
executeTest(" testASMQMateRankSumAnnotation", spec);
}
@ -348,7 +348,7 @@ public class HaplotypeCallerGVCFIntegrationTest extends WalkerTest {
public void testBetaTestingAnnotationGroup() {
final String commandLine = String.format("-T HaplotypeCaller --pcr_indel_model NONE -R %s -I %s -L %s -ERC GVCF --no_cmdline_in_header -variant_index_type %s -variant_index_parameter %d -G BetaTesting --disableDithering",
b37KGReference, privateTestDir + "NA12878.HiSeq.b37.chr20.10_11mb.bam", "20:10433000-10437000", GATKVCFUtils.DEFAULT_GVCF_INDEX_TYPE, GATKVCFUtils.DEFAULT_GVCF_INDEX_PARAMETER);
final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Collections.singletonList("1231cc5e3a73cb0fb97c26011ce4881a"));
final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Collections.singletonList("a1677568cbd9304df927feea8a23085b"));
spec.disableShadowBCF();
executeTest(" testASMQMateRankSumAnnotation", spec);
}
@ -357,7 +357,7 @@ public class HaplotypeCallerGVCFIntegrationTest extends WalkerTest {
public void testASInsertSizeRankSum() {
final String commandLine = String.format("-T HaplotypeCaller --pcr_indel_model NONE -R %s -I %s -L %s -ERC GVCF --no_cmdline_in_header -variant_index_type %s -variant_index_parameter %d -G Standard -G AS_Standard --disableDithering -A AS_InsertSizeRankSum",
b37KGReference, privateTestDir + "NA12878.HiSeq.b37.chr20.10_11mb.bam", "20:10433000-10437000", GATKVCFUtils.DEFAULT_GVCF_INDEX_TYPE, GATKVCFUtils.DEFAULT_GVCF_INDEX_PARAMETER);
final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Collections.singletonList("173347a78bafd67fffbef47144326c78"));
final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Collections.singletonList("7809baf72434fdfa94e8e5862596e36b"));
spec.disableShadowBCF();
executeTest(" testASInsertSizeRankSum", spec);
}
@ -461,7 +461,7 @@ public class HaplotypeCallerGVCFIntegrationTest extends WalkerTest {
public void testHaplotypeCallerGVCSpanDel() {
final String commandLine = String.format("-T HaplotypeCaller -R %s -I %s -L 1:26357667 -ERC GVCF --no_cmdline_in_header -A AS_ReadPosRankSumTest -A ReadPosRankSumTest -variant_index_type %s -variant_index_parameter %d",
b37KGReference, privateTestDir + "NexPond-377866-1:26357600-26357700.bam", GATKVCFUtils.DEFAULT_GVCF_INDEX_TYPE, GATKVCFUtils.DEFAULT_GVCF_INDEX_PARAMETER);
final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Arrays.asList("e880bd94005a195e354c0ee811213a77"));
final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Arrays.asList("5b1b75afcb1580df25a1cd7099ce050b"));
spec.disableShadowBCF();
executeTest("testHaplotypeCallerGVCSpanDel", spec);
}

View File

@ -279,7 +279,7 @@ public class CombineGVCFsIntegrationTest extends WalkerTest {
public void testAlleleSpecificAnnotations() throws Exception {
final String cmd = "-T CombineGVCFs -R " + b37KGReference + " -o %s --no_cmdline_in_header -G Standard -G AS_Standard -V "
+ privateTestDir + "NA12878.AS.chr20snippet.g.vcf -V " + privateTestDir + "NA12892.AS.chr20snippet.g.vcf";
final WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, Arrays.asList("26606fbfc3e9e5a8813307227d898b9a"));
final WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, Arrays.asList("43196128ec5876b7f352c48e28e971b7"));
spec.disableShadowBCF();
executeTest("testAlleleSpecificAnnotations", spec);
}
@ -300,7 +300,7 @@ public class CombineGVCFsIntegrationTest extends WalkerTest {
public void testASMateRankSumAnnotation() throws Exception {
final String cmd = "-T CombineGVCFs -R " + b37KGReference + " -o %s --no_cmdline_in_header -G Standard -G AS_Standard -A AS_MQMateRankSumTest -V "
+ privateTestDir + "NA12878.AS.MateRankSum.chr20snippet.g.vcf -V " + privateTestDir + "NA12892.AS.MateRankSum.chr20snippet.g.vcf";
final WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, Arrays.asList("9fd72e0f1f0f08a5aff6875225eec567"));
final WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, Arrays.asList("d93b649f38b490f3f234cd50bd4a3a3b"));
spec.disableShadowBCF();
executeTest("testASMateRankSumAnnotation", spec);
}
@ -309,7 +309,7 @@ public class CombineGVCFsIntegrationTest extends WalkerTest {
public void testASInsertSizeRankSumAnnotation() throws Exception {
final String cmd = "-T CombineGVCFs -R " + b37KGReference + " -o %s --no_cmdline_in_header -G Standard -G AS_Standard -A AS_InsertSizeRankSum -V "
+ privateTestDir + "NA12878.AS.InsertSizeRankSum.chr20snippet.g.vcf -V " + privateTestDir + "NA12892.AS.InsertSizeRankSum.chr20snippet.g.vcf";
final WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, Arrays.asList("b8e10684010702d11ee71538b1f201ac"));
final WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, Arrays.asList("eed25cfb0fa4e05ff64a1ce96eecaeae"));
spec.disableShadowBCF();
executeTest("testASInsertSizeRankSumAnnotation", spec);
}

View File

@ -599,7 +599,7 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest {
public void testAlleleSpecificAnnotations() {
final String cmd = "-T GenotypeGVCFs -R " + b37KGReference + " -o %s --no_cmdline_in_header -G Standard -G AS_Standard --disableDithering -V "
+ privateTestDir + "NA12878.AS.chr20snippet.g.vcf -V " + privateTestDir + "NA12892.AS.chr20snippet.g.vcf";
final WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, Collections.singletonList("6e24254a6903b7280f1d2f8e0b62ff65"));
final WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, Collections.singletonList("f8f65bf122898d5777db0bce152eed27"));
spec.disableShadowBCF();
executeTest("testAlleleSpecificAnnotations", spec);
}
@ -619,7 +619,7 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest {
public void testASMateRankSumAnnotation() {
final String cmd = "-T GenotypeGVCFs -R " + b37KGReference + " -o %s --no_cmdline_in_header -G Standard -G AS_Standard -A AS_MQMateRankSumTest --disableDithering -V "
+ privateTestDir + "NA12878.AS.MateRankSum.chr20snippet.g.vcf -V " + privateTestDir + "NA12892.AS.MateRankSum.chr20snippet.g.vcf";
final WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, Collections.singletonList("03e1188bf5bdd196b392b3c56d33112f"));
final WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, Collections.singletonList("1246f62a9dc7ce2ca1913e8c29e748b1"));
spec.disableShadowBCF();
executeTest("testASMateRankSumAnnotation", spec);
}
@ -628,7 +628,7 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest {
public void testASInsertSizeRankSumAnnotation() {
final String cmd = "-T GenotypeGVCFs -R " + b37KGReference + " -o %s --no_cmdline_in_header -G Standard -G AS_Standard -A AS_InsertSizeRankSum --disableDithering -V "
+ privateTestDir + "NA12878.AS.InsertSizeRankSum.chr20snippet.g.vcf -V " + privateTestDir + "NA12892.AS.InsertSizeRankSum.chr20snippet.g.vcf";
final WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, Collections.singletonList("1ca5f4c67340a3087159f29cc31772b8"));
final WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, Collections.singletonList("20317589428d765526798aabb9129fa3"));
spec.disableShadowBCF();
executeTest("testASInsertSizeRankSumAnnotation", spec);
}
@ -641,7 +641,7 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest {
public void testAlleleSpecificAnnotations_oneSample() {
final String cmd = "-T GenotypeGVCFs -R " + b37KGReference + " -o %s --no_cmdline_in_header -G Standard -G AS_Standard --disableDithering -V "
+ privateTestDir + "NA12878.AS.chr20snippet.g.vcf";
final WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, Collections.singletonList("d2d59eac6879bfc54ce32f46e9ac9dad"));
final WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, Collections.singletonList("7794512dee90a6b47cf2a932e121ff72"));
spec.disableShadowBCF();
executeTest("testAlleleSpecificAnnotations_oneSample", spec);
}
@ -651,7 +651,7 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest {
public void testAlleleSpecificAnnotations_moreSamples() {
final String cmd = "-T GenotypeGVCFs -R " + b37KGReference + " -o %s --no_cmdline_in_header -G Standard -G AS_Standard -A AS_InsertSizeRankSum -A AS_MQMateRankSumTest --disableDithering -V "
+ privateTestDir + "multiSamples.g.vcf";
final WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, Collections.singletonList("8cc75a852b59145721163300e1be8392"));
final WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, Collections.singletonList("87284eff9cbc39f84d2e5c5492fd4f80"));
spec.disableShadowBCF();
executeTest("testAlleleSpecificAnnotations_moreSamples", spec);
}
@ -669,7 +669,7 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest {
public void testFractionInformativeReads() {
final String cmd = "-T GenotypeGVCFs -R " + b37KGReference + " -G AS_Standard -o %s --no_cmdline_in_header -A FractionInformativeReads --disableDithering -V "
+ privateTestDir + "NA12878.AS.chr20snippet.g.vcf -V " + privateTestDir + "NA12891.AS.chr20snippet.g.vcf";
final WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, Collections.singletonList("9245c5abfba583dc6532fd82c6815da2"));
final WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, Collections.singletonList("20ab6b89ab72fc2da8ac7a2db31cb15c"));
spec.disableShadowBCF();
executeTest("testAlleleSpecificAnnotations", spec);
}

View File

@ -51,6 +51,8 @@ public class Histogram {
}
public void add(final Double d) {
if (d.isNaN())
return;
long binKey = getBinnedValue(d);
if (isValidBinKey(binKey))
dataList.add((int) binKey);
@ -132,6 +134,8 @@ public class Histogram {
printDelim = ",";
String str = "";
Object[] keys = dataList.valueCounts.keySet().toArray();
if (keys.length == 0)
return Double.toString(Double.NaN);
Arrays.sort(keys);
for (Object i: keys){
if(!str.isEmpty())
@ -140,4 +144,8 @@ public class Histogram {
}
return str;
}
public boolean isEmpty() {
return dataList.isEmpty();
}
}