Merge branch 'master' of ssh://gsa1/humgen/gsa-scr1/gsa-engineering/git/unstable

This commit is contained in:
Matt Hanna 2011-09-23 14:53:28 -04:00
commit e388c357ca
42 changed files with 889 additions and 685 deletions

View File

@ -26,6 +26,7 @@ package org.broadinstitute.sting.gatk.examples;
import org.broadinstitute.sting.commandline.Argument;
import org.broadinstitute.sting.commandline.ArgumentCollection;
import org.broadinstitute.sting.commandline.Hidden;
import org.broadinstitute.sting.gatk.arguments.StandardVariantContextInputArgumentCollection;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
@ -59,6 +60,7 @@ import org.broadinstitute.sting.gatk.walkers.RodWalker;
* @author Your Name
* @since Date created
*/
@Hidden
public class GATKDocsExample extends RodWalker<Integer, Integer> {
/**
* Put detailed documentation about the argument here. No need to duplicate the summary information

View File

@ -43,6 +43,9 @@ import java.util.List;
import java.util.Map;
/**
* The allele balance (fraction of ref bases over ref + alt bases) across all bialleleic het-called samples
*/
public class AlleleBalance extends InfoFieldAnnotation {
public Map<String, Object> annotate(RefMetaDataTracker tracker, AnnotatorCompatibleWalker walker, ReferenceContext ref, Map<String, AlignmentContext> stratifiedContexts, VariantContext vc) {

View File

@ -16,6 +16,9 @@ import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import java.util.*;
/**
* The allele balance (fraction of ref bases over ref + alt bases) separately for each bialleleic het-called sample
*/
public class AlleleBalanceBySample extends GenotypeAnnotation implements ExperimentalAnnotation {
public Map<String, Object> annotate(RefMetaDataTracker tracker, AnnotatorCompatibleWalker walker, ReferenceContext ref, AlignmentContext stratifiedContext, VariantContext vc, Genotype g) {

View File

@ -6,8 +6,9 @@ import org.broadinstitute.sting.utils.variantcontext.Genotype;
import java.util.Map;
/**
* Abstract base class for all annotations that are normalized by depth
*/
public abstract class AnnotationByDepth extends InfoFieldAnnotation {

View File

@ -47,6 +47,9 @@ import java.util.List;
import java.util.Map;
/**
* Count of A, C, G, T bases across all samples
*/
public class BaseCounts extends InfoFieldAnnotation {
public Map<String, Object> annotate(RefMetaDataTracker tracker, AnnotatorCompatibleWalker walker, ReferenceContext ref, Map<String, AlignmentContext> stratifiedContexts, VariantContext vc) {

View File

@ -13,6 +13,9 @@ import java.util.LinkedHashMap;
import java.util.List;
/**
* The phred-scaled p-value (u-based z-approximation) from the Mann-Whitney Rank Sum Test for base qualities (ref bases vs. bases of the alternate allele)
*/
public class BaseQualityRankSumTest extends RankSumTest {
public List<String> getKeyNames() { return Arrays.asList("BaseQRankSum"); }

View File

@ -44,6 +44,11 @@ import java.util.List;
import java.util.Map;
/**
* Allele count in genotypes, for each ALT allele, in the same order as listed;
* allele Frequency, for each ALT allele, in the same order as listed; total number
* of alleles in called genotypes.
*/
public class ChromosomeCounts extends InfoFieldAnnotation implements StandardAnnotation {
private String[] keyNames = { VCFConstants.ALLELE_NUMBER_KEY, VCFConstants.ALLELE_COUNT_KEY, VCFConstants.ALLELE_FREQUENCY_KEY };

View File

@ -16,7 +16,12 @@ import java.util.HashMap;
import java.util.List;
import java.util.Map;
/**
* Total (unfiltered) depth over all samples.
*
* Affected by downsampling (-dcov) though, so the max value one can obtain for N samples with -dcov D
* is N * D
*/
public class DepthOfCoverage extends InfoFieldAnnotation implements StandardAnnotation {
public Map<String, Object> annotate(RefMetaDataTracker tracker, AnnotatorCompatibleWalker walker, ReferenceContext ref, Map<String, AlignmentContext> stratifiedContexts, VariantContext vc) {

View File

@ -23,6 +23,25 @@ import java.util.List;
import java.util.Map;
/**
* The depth of coverage of each VCF allele in this sample
*
* Complementary fields that two important ways of thinking about the depth of the data for this sample
* at this site. The DP field describe the total depth of reads that passed the Unified Genotypers internal
* quality control metrics (like MAPQ > 17, for example), whatever base was present in the read at this site.
* The AD values (one for each of REF and ALT fields) is the count of all reads that carried with them the
* REF and ALT alleles. The reason for this distinction is that the DP is in some sense reflective of the
* power I have to determine the genotype of the sample at this site, while the AD tells me how many times
* I saw each of the REF and ALT alleles in the reads, free of any bias potentially introduced by filtering
* the reads. If, for example, I believe there really is a an A/T polymorphism at a site, then I would like
* to know the counts of A and T bases in this sample, even for reads with poor mapping quality that would
* normally be excluded from the statistical calculations going into GQ and QUAL. Please note, however, that
* the AD isn't necessarily calculated exactly for indels (it counts as non-reference only those indels that
* are actually present and correctly left-aligned in the alignments themselves). Because of this fact and
* because the AD includes reads and bases that were filtered by the Unified Genotyper, one should not base
* assumptions about the underlying genotype based on it; instead, the genotype likelihoods (PLs) are what
* determine the genotype calls (see below).
*/
public class DepthPerAlleleBySample extends GenotypeAnnotation implements StandardAnnotation {
private static String REF_ALLELE = "REF";

View File

@ -43,6 +43,11 @@ import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import java.util.*;
/**
* Phred-scaled p-value using Fisher's Exact Test to detect strand bias (the variation
* being seen on only the forward or only the reverse strand) in the reads? More bias is
* indicative of false positive calls.
*/
public class FisherStrand extends InfoFieldAnnotation implements StandardAnnotation {
private static final String FS = "FS";
private static final double MIN_PVALUE = 1E-320;

View File

@ -17,6 +17,9 @@ import java.util.List;
import java.util.Map;
/**
* The GC content (# GC bases / # all bases) of the reference within 50 bp +/- this site
*/
public class GCContent extends InfoFieldAnnotation implements ExperimentalAnnotation {
public Map<String, Object> annotate(RefMetaDataTracker tracker, AnnotatorCompatibleWalker walker, ReferenceContext ref, Map<String, AlignmentContext> stratifiedContexts, VariantContext vc) {

View File

@ -49,6 +49,10 @@ import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import java.util.*;
/**
* Consistency of the site with two (and only two) segregating haplotypes. Higher scores
* are indicative of regions with bad alignments, often leading to artifactual SNP and indel calls.
*/
public class HaplotypeScore extends InfoFieldAnnotation implements StandardAnnotation {
private final static boolean DEBUG = false;
private final static int MIN_CONTEXT_WING_SIZE = 10;

View File

@ -19,6 +19,9 @@ import java.util.List;
import java.util.Map;
/**
* Phred-scaled P value of genotype-based (using GT field) test for Hardy-Weinberg test for disequilibrium
*/
public class HardyWeinberg extends InfoFieldAnnotation implements WorkInProgressAnnotation {
private static final int MIN_SAMPLES = 10;

View File

@ -16,7 +16,9 @@ import java.util.HashMap;
import java.util.List;
import java.util.Map;
/**
* Largest contiguous homopolymer run of the variant allele in either direction on the reference.
*/
public class HomopolymerRun extends InfoFieldAnnotation implements StandardAnnotation {
private boolean ANNOTATE_INDELS = true;

View File

@ -17,14 +17,15 @@ import java.util.HashMap;
import java.util.List;
import java.util.Map;
/**
* Created by IntelliJ IDEA.
* User: rpoplin
* Date: 5/16/11
*/
// A set of annotations calculated directly from the GLs
public class GLstats extends InfoFieldAnnotation implements StandardAnnotation {
/**
* Likelihood-based (using PL field) test for the inbreeding among samples.
*
* A continuous generalization of the Hardy-Weinberg test for disequilibrium that works
* well with limited coverage per sample. See the 1000 Genomes Phase I release for
* more information.
*/
public class InbreedingCoeff extends InfoFieldAnnotation implements StandardAnnotation {
private static final int MIN_SAMPLES = 10;

View File

@ -14,11 +14,7 @@ import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import java.util.*;
/**
* Created by IntelliJ IDEA.
* User: delangel
* Date: Mar 11, 2011
* Time: 11:47:33 AM
* To change this template use File | Settings | File Templates.
* Rough category of indel type (insertion, deletion, multi-allelic, other)
*/
public class IndelType extends InfoFieldAnnotation implements ExperimentalAnnotation {

View File

@ -17,6 +17,9 @@ import java.util.List;
import java.util.Map;
/**
* Triplet annotation: fraction of MAQP == 0, MAPQ < 10, and count of all mapped reads
*/
public class LowMQ extends InfoFieldAnnotation {
public Map<String, Object> annotate(RefMetaDataTracker tracker, AnnotatorCompatibleWalker walker, ReferenceContext ref, Map<String, AlignmentContext> stratifiedContexts, VariantContext vc) {

View File

@ -14,6 +14,9 @@ import java.util.LinkedHashMap;
import java.util.List;
/**
* The phred-scaled p-value (u-based z-approximation) from the Mann-Whitney Rank Sum Test for mapping qualities (reads with ref bases vs. those with the alternate allele)
*/
public class MappingQualityRankSumTest extends RankSumTest {
public List<String> getKeyNames() { return Arrays.asList("MQRankSum"); }

View File

@ -19,6 +19,9 @@ import java.util.List;
import java.util.Map;
/**
* Total count across all samples of mapping quality zero reads
*/
public class MappingQualityZero extends InfoFieldAnnotation implements StandardAnnotation {
public Map<String, Object> annotate(RefMetaDataTracker tracker, AnnotatorCompatibleWalker walker, ReferenceContext ref, Map<String, AlignmentContext> stratifiedContexts, VariantContext vc) {

View File

@ -1,85 +1,81 @@
/*
* Copyright (c) 2010 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.walkers.annotator;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatibleWalker;
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.GenotypeAnnotation;
import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants;
import org.broadinstitute.sting.utils.codecs.vcf.VCFFormatHeaderLine;
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType;
import org.broadinstitute.sting.utils.pileup.PileupElement;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
import org.broadinstitute.sting.utils.variantcontext.Genotype;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
/**
* Created by IntelliJ IDEA.
* User: asivache
* Date: Feb 4, 2011
* Time: 6:46:25 PM
* To change this template use File | Settings | File Templates.
*/
public class MappingQualityZeroBySample extends GenotypeAnnotation {
public Map<String, Object> annotate(RefMetaDataTracker tracker,
AnnotatorCompatibleWalker walker, ReferenceContext ref, AlignmentContext context, VariantContext vc, Genotype g) {
if ( g == null || !g.isCalled() )
return null;
int mq0 = 0;
ReadBackedPileup pileup = null;
if (vc.isIndel() && context.hasExtendedEventPileup())
pileup = context.getExtendedEventPileup();
else if (context.hasBasePileup())
pileup = context.getBasePileup();
else return null;
if (pileup != null) {
for (PileupElement p : pileup ) {
if ( p.getMappingQual() == 0 )
mq0++;
}
}
Map<String, Object> map = new HashMap<String, Object>();
map.put(getKeyNames().get(0), String.format("%d", mq0));
return map;
}
public List<String> getKeyNames() { return Arrays.asList(VCFConstants.MAPPING_QUALITY_ZERO_KEY); }
public List<VCFFormatHeaderLine> getDescriptions() { return Arrays.asList(
new VCFFormatHeaderLine(getKeyNames().get(0), 1,
VCFHeaderLineType.Integer, "Number of Mapping Quality Zero Reads per sample")); }
}
/*
* Copyright (c) 2010 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.walkers.annotator;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatibleWalker;
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.GenotypeAnnotation;
import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants;
import org.broadinstitute.sting.utils.codecs.vcf.VCFFormatHeaderLine;
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType;
import org.broadinstitute.sting.utils.pileup.PileupElement;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
import org.broadinstitute.sting.utils.variantcontext.Genotype;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
/**
* Count for each sample of mapping quality zero reads
*/
public class MappingQualityZeroBySample extends GenotypeAnnotation {
public Map<String, Object> annotate(RefMetaDataTracker tracker,
AnnotatorCompatibleWalker walker, ReferenceContext ref, AlignmentContext context, VariantContext vc, Genotype g) {
if ( g == null || !g.isCalled() )
return null;
int mq0 = 0;
ReadBackedPileup pileup = null;
if (vc.isIndel() && context.hasExtendedEventPileup())
pileup = context.getExtendedEventPileup();
else if (context.hasBasePileup())
pileup = context.getBasePileup();
else return null;
if (pileup != null) {
for (PileupElement p : pileup ) {
if ( p.getMappingQual() == 0 )
mq0++;
}
}
Map<String, Object> map = new HashMap<String, Object>();
map.put(getKeyNames().get(0), String.format("%d", mq0));
return map;
}
public List<String> getKeyNames() { return Arrays.asList(VCFConstants.MAPPING_QUALITY_ZERO_KEY); }
public List<VCFFormatHeaderLine> getDescriptions() { return Arrays.asList(
new VCFFormatHeaderLine(getKeyNames().get(0), 1,
VCFHeaderLineType.Integer, "Number of Mapping Quality Zero Reads per sample")); }
}

View File

@ -17,8 +17,9 @@ import java.util.HashMap;
import java.util.List;
import java.util.Map;
/**
* Fraction of all reads across samples that have mapping quality zero
*/
public class MappingQualityZeroFraction extends InfoFieldAnnotation implements ExperimentalAnnotation {
public Map<String, Object> annotate(RefMetaDataTracker tracker, AnnotatorCompatibleWalker walker, ReferenceContext ref, Map<String, AlignmentContext> stratifiedContexts, VariantContext vc) {

View File

@ -17,11 +17,8 @@ import java.util.List;
import java.util.Map;
/**
* Created by IntelliJ IDEA.
* User: rpoplin
* Date: 5/16/11
* The number of N bases, counting only SOLiD data
*/
public class NBaseCount extends InfoFieldAnnotation {
public Map<String, Object> annotate(RefMetaDataTracker tracker, AnnotatorCompatibleWalker walker, ReferenceContext ref, Map<String, AlignmentContext> stratifiedContexts, VariantContext vc) {
if( stratifiedContexts.size() == 0 )

View File

@ -1,5 +1,6 @@
package org.broadinstitute.sting.gatk.walkers.annotator;
import org.broadinstitute.sting.commandline.Hidden;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
@ -15,7 +16,11 @@ import java.util.HashMap;
import java.util.List;
import java.util.Map;
/**
* Variant confidence (given as (AB+BB)/AA from the PLs) / unfiltered depth.
*
* Low scores are indicative of false positive calls and artifacts.
*/
public class QualByDepth extends AnnotationByDepth implements StandardAnnotation {
public Map<String, Object> annotate(RefMetaDataTracker tracker, AnnotatorCompatibleWalker walker, ReferenceContext ref, Map<String, AlignmentContext> stratifiedContexts, VariantContext vc) {

View File

@ -21,6 +21,9 @@ import java.util.List;
import java.util.Map;
/**
* Root Mean Square of the mapping quality of the reads across all samples.
*/
public class RMSMappingQuality extends InfoFieldAnnotation implements StandardAnnotation {
public Map<String, Object> annotate(RefMetaDataTracker tracker, AnnotatorCompatibleWalker walker, ReferenceContext ref, Map<String, AlignmentContext> stratifiedContexts, VariantContext vc) {

View File

@ -21,7 +21,9 @@ import java.util.List;
import java.util.Map;
/**
* Abstract root for all RankSum based annotations
*/
public abstract class RankSumTest extends InfoFieldAnnotation implements StandardAnnotation {
static final double INDEL_LIKELIHOOD_THRESH = 0.1;
static final boolean DEBUG = false;

View File

@ -1,209 +1,207 @@
/*
* Copyright (c) 2010 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.walkers.annotator;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatibleWalker;
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.GenotypeAnnotation;
import org.broadinstitute.sting.utils.codecs.vcf.VCFFormatHeaderLine;
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineCount;
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType;
import org.broadinstitute.sting.utils.pileup.ExtendedEventPileupElement;
import org.broadinstitute.sting.utils.pileup.PileupElement;
import org.broadinstitute.sting.utils.pileup.ReadBackedExtendedEventPileup;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
import org.broadinstitute.sting.utils.variantcontext.Allele;
import org.broadinstitute.sting.utils.variantcontext.Genotype;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
/**
* Created by IntelliJ IDEA.
* User: asivache
* Date: Feb 4, 2011
* Time: 3:59:27 PM
* To change this template use File | Settings | File Templates.
*/
public class ReadDepthAndAllelicFractionBySample extends GenotypeAnnotation {
private static String REF_ALLELE = "REF";
private static String DEL = "DEL"; // constant, for speed: no need to create a key string for deletion allele every time
public Map<String, Object> annotate(RefMetaDataTracker tracker, AnnotatorCompatibleWalker walker, ReferenceContext ref,
AlignmentContext stratifiedContext, VariantContext vc, Genotype g) {
if ( g == null || !g.isCalled() )
return null;
if ( vc.isSNP() )
return annotateSNP(stratifiedContext, vc);
if ( vc.isIndel() )
return annotateIndel(stratifiedContext, vc);
return null;
}
private Map<String,Object> annotateSNP(AlignmentContext stratifiedContext, VariantContext vc) {
if ( ! stratifiedContext.hasBasePileup() ) return null;
HashMap<Byte, Integer> alleleCounts = new HashMap<Byte, Integer>();
for ( Allele allele : vc.getAlternateAlleles() )
alleleCounts.put(allele.getBases()[0], 0);
ReadBackedPileup pileup = stratifiedContext.getBasePileup();
int totalDepth = pileup.size();
Map<String, Object> map = new HashMap<String, Object>();
map.put(getKeyNames().get(0), totalDepth); // put total depth in right away
if ( totalDepth == 0 ) return map; // done, can not compute FA at 0 coverage!!
int mq0 = 0; // number of "ref" reads that are acually mq0
for ( PileupElement p : pileup ) {
if ( p.getMappingQual() == 0 ) {
mq0++;
continue;
}
if ( alleleCounts.containsKey(p.getBase()) ) // non-mq0 read and it's an alt
alleleCounts.put(p.getBase(), alleleCounts.get(p.getBase())+1);
}
if ( mq0 == totalDepth ) return map; // if all reads are mq0, there is nothing left to do
// we need to add counts in the correct order
String[] fracs = new String[alleleCounts.size()];
for (int i = 0; i < vc.getAlternateAlleles().size(); i++) {
fracs[i] = String.format("%.3f", ((float)alleleCounts.get(vc.getAlternateAllele(i).getBases()[0]))/(totalDepth-mq0));
}
map.put(getKeyNames().get(1), fracs);
return map;
}
private Map<String,Object> annotateIndel(AlignmentContext
stratifiedContext, VariantContext
vc) {
if ( ! stratifiedContext.hasExtendedEventPileup() ) {
return null;
}
ReadBackedExtendedEventPileup pileup = stratifiedContext.getExtendedEventPileup();
if ( pileup == null )
return null;
int totalDepth = pileup.size();
Map<String, Object> map = new HashMap<String, Object>();
map.put(getKeyNames().get(0), totalDepth); // put total depth in right away
if ( totalDepth == 0 ) return map;
int mq0 = 0; // number of "ref" reads that are acually mq0
HashMap<String, Integer> alleleCounts = new HashMap<String, Integer>();
Allele refAllele = vc.getReference();
for ( Allele allele : vc.getAlternateAlleles() ) {
if ( allele.isNoCall() ) {
continue; // this does not look so good, should we die???
}
alleleCounts.put(getAlleleRepresentation(allele), 0);
}
for ( ExtendedEventPileupElement e : pileup.toExtendedIterable() ) {
if ( e.getMappingQual() == 0 ) {
mq0++;
continue;
}
if ( e.isInsertion() ) {
final String b = e.getEventBases();
if ( alleleCounts.containsKey(b) ) {
alleleCounts.put(b, alleleCounts.get(b)+1);
}
} else {
if ( e.isDeletion() ) {
if ( e.getEventLength() == refAllele.length() ) {
// this is indeed the deletion allele recorded in VC
final String b = DEL;
if ( alleleCounts.containsKey(b) ) {
alleleCounts.put(b, alleleCounts.get(b)+1);
}
}
// else {
// System.out.print(" deletion of WRONG length found");
// }
}
}
}
if ( mq0 == totalDepth ) return map;
String[] fracs = new String[alleleCounts.size()];
for (int i = 0; i < vc.getAlternateAlleles().size(); i++)
fracs[i] = String.format("%.3f",
((float)alleleCounts.get(getAlleleRepresentation(vc.getAlternateAllele(i))))/(totalDepth-mq0));
map.put(getKeyNames().get(1), fracs);
//map.put(getKeyNames().get(0), counts);
return map;
}
private String getAlleleRepresentation(Allele allele) {
if ( allele.isNull() ) { // deletion wrt the ref
return DEL;
} else { // insertion, pass actual bases
return allele.getBaseString();
}
}
// public String getIndelBases()
public List<String> getKeyNames() { return Arrays.asList("DP","FA"); }
public List<VCFFormatHeaderLine> getDescriptions() {
return Arrays.asList(new VCFFormatHeaderLine(getKeyNames().get(0),
1,
VCFHeaderLineType.Integer,
"Total read depth per sample, including MQ0"),
new VCFFormatHeaderLine(getKeyNames().get(1),
VCFHeaderLineCount.UNBOUNDED,
VCFHeaderLineType.Float,
"Fractions of reads (excluding MQ0 from both ref and alt) supporting each reported alternative allele, per sample"));
}
}
/*
* Copyright (c) 2010 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.walkers.annotator;
import org.broadinstitute.sting.commandline.Hidden;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatibleWalker;
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.GenotypeAnnotation;
import org.broadinstitute.sting.utils.codecs.vcf.VCFFormatHeaderLine;
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineCount;
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType;
import org.broadinstitute.sting.utils.pileup.ExtendedEventPileupElement;
import org.broadinstitute.sting.utils.pileup.PileupElement;
import org.broadinstitute.sting.utils.pileup.ReadBackedExtendedEventPileup;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
import org.broadinstitute.sting.utils.variantcontext.Allele;
import org.broadinstitute.sting.utils.variantcontext.Genotype;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
/**
* Unsupported
*/
@Hidden
public class ReadDepthAndAllelicFractionBySample extends GenotypeAnnotation {
private static String REF_ALLELE = "REF";
private static String DEL = "DEL"; // constant, for speed: no need to create a key string for deletion allele every time
public Map<String, Object> annotate(RefMetaDataTracker tracker, AnnotatorCompatibleWalker walker, ReferenceContext ref,
AlignmentContext stratifiedContext, VariantContext vc, Genotype g) {
if ( g == null || !g.isCalled() )
return null;
if ( vc.isSNP() )
return annotateSNP(stratifiedContext, vc);
if ( vc.isIndel() )
return annotateIndel(stratifiedContext, vc);
return null;
}
private Map<String,Object> annotateSNP(AlignmentContext stratifiedContext, VariantContext vc) {
if ( ! stratifiedContext.hasBasePileup() ) return null;
HashMap<Byte, Integer> alleleCounts = new HashMap<Byte, Integer>();
for ( Allele allele : vc.getAlternateAlleles() )
alleleCounts.put(allele.getBases()[0], 0);
ReadBackedPileup pileup = stratifiedContext.getBasePileup();
int totalDepth = pileup.size();
Map<String, Object> map = new HashMap<String, Object>();
map.put(getKeyNames().get(0), totalDepth); // put total depth in right away
if ( totalDepth == 0 ) return map; // done, can not compute FA at 0 coverage!!
int mq0 = 0; // number of "ref" reads that are acually mq0
for ( PileupElement p : pileup ) {
if ( p.getMappingQual() == 0 ) {
mq0++;
continue;
}
if ( alleleCounts.containsKey(p.getBase()) ) // non-mq0 read and it's an alt
alleleCounts.put(p.getBase(), alleleCounts.get(p.getBase())+1);
}
if ( mq0 == totalDepth ) return map; // if all reads are mq0, there is nothing left to do
// we need to add counts in the correct order
String[] fracs = new String[alleleCounts.size()];
for (int i = 0; i < vc.getAlternateAlleles().size(); i++) {
fracs[i] = String.format("%.3f", ((float)alleleCounts.get(vc.getAlternateAllele(i).getBases()[0]))/(totalDepth-mq0));
}
map.put(getKeyNames().get(1), fracs);
return map;
}
private Map<String,Object> annotateIndel(AlignmentContext
stratifiedContext, VariantContext
vc) {
if ( ! stratifiedContext.hasExtendedEventPileup() ) {
return null;
}
ReadBackedExtendedEventPileup pileup = stratifiedContext.getExtendedEventPileup();
if ( pileup == null )
return null;
int totalDepth = pileup.size();
Map<String, Object> map = new HashMap<String, Object>();
map.put(getKeyNames().get(0), totalDepth); // put total depth in right away
if ( totalDepth == 0 ) return map;
int mq0 = 0; // number of "ref" reads that are acually mq0
HashMap<String, Integer> alleleCounts = new HashMap<String, Integer>();
Allele refAllele = vc.getReference();
for ( Allele allele : vc.getAlternateAlleles() ) {
if ( allele.isNoCall() ) {
continue; // this does not look so good, should we die???
}
alleleCounts.put(getAlleleRepresentation(allele), 0);
}
for ( ExtendedEventPileupElement e : pileup.toExtendedIterable() ) {
if ( e.getMappingQual() == 0 ) {
mq0++;
continue;
}
if ( e.isInsertion() ) {
final String b = e.getEventBases();
if ( alleleCounts.containsKey(b) ) {
alleleCounts.put(b, alleleCounts.get(b)+1);
}
} else {
if ( e.isDeletion() ) {
if ( e.getEventLength() == refAllele.length() ) {
// this is indeed the deletion allele recorded in VC
final String b = DEL;
if ( alleleCounts.containsKey(b) ) {
alleleCounts.put(b, alleleCounts.get(b)+1);
}
}
// else {
// System.out.print(" deletion of WRONG length found");
// }
}
}
}
if ( mq0 == totalDepth ) return map;
String[] fracs = new String[alleleCounts.size()];
for (int i = 0; i < vc.getAlternateAlleles().size(); i++)
fracs[i] = String.format("%.3f",
((float)alleleCounts.get(getAlleleRepresentation(vc.getAlternateAllele(i))))/(totalDepth-mq0));
map.put(getKeyNames().get(1), fracs);
//map.put(getKeyNames().get(0), counts);
return map;
}
private String getAlleleRepresentation(Allele allele) {
if ( allele.isNull() ) { // deletion wrt the ref
return DEL;
} else { // insertion, pass actual bases
return allele.getBaseString();
}
}
// public String getIndelBases()
public List<String> getKeyNames() { return Arrays.asList("DP","FA"); }
public List<VCFFormatHeaderLine> getDescriptions() {
return Arrays.asList(new VCFFormatHeaderLine(getKeyNames().get(0),
1,
VCFHeaderLineType.Integer,
"Total read depth per sample, including MQ0"),
new VCFFormatHeaderLine(getKeyNames().get(1),
VCFHeaderLineCount.UNBOUNDED,
VCFHeaderLineType.Float,
"Fractions of reads (excluding MQ0 from both ref and alt) supporting each reported alternative allele, per sample"));
}
}

View File

@ -19,11 +19,8 @@ import java.util.LinkedHashMap;
import java.util.List;
/**
* Created by IntelliJ IDEA.
* User: rpoplin
* Date: 3/30/11
* The phred-scaled p-value (u-based z-approximation) from the Mann-Whitney Rank Sum Test for the distance from the end of the read for reads with the alternate allele; if the alternate allele is only seen near the ends of reads this is indicative of error).
*/
public class ReadPosRankSumTest extends RankSumTest {
public List<String> getKeyNames() { return Arrays.asList("ReadPosRankSum"); }

View File

@ -15,8 +15,9 @@ import java.util.HashMap;
import java.util.List;
import java.util.Map;
/**
* SB annotation value by depth of alt containing samples
*/
public class SBByDepth extends AnnotationByDepth {
public Map<String, Object> annotate(RefMetaDataTracker tracker, AnnotatorCompatibleWalker walker, ReferenceContext ref, Map<String, AlignmentContext> stratifiedContexts, VariantContext vc) {

View File

@ -41,7 +41,9 @@ import java.util.HashMap;
import java.util.List;
import java.util.Map;
/**
* List all of the samples in the info field
*/
public class SampleList extends InfoFieldAnnotation {
public Map<String, Object> annotate(RefMetaDataTracker tracker, AnnotatorCompatibleWalker walker, ReferenceContext ref, Map<String, AlignmentContext> stratifiedContexts, VariantContext vc) {

View File

@ -17,6 +17,9 @@ import java.util.List;
import java.util.Map;
/**
* Fraction of reads containing spanning deletions at this site.
*/
public class SpanningDeletions extends InfoFieldAnnotation implements StandardAnnotation {
public Map<String, Object> annotate(RefMetaDataTracker tracker, AnnotatorCompatibleWalker walker, ReferenceContext ref, Map<String, AlignmentContext> stratifiedContexts, VariantContext vc) {

View File

@ -1,5 +1,6 @@
package org.broadinstitute.sting.gatk.walkers.annotator;
import org.broadinstitute.sting.commandline.Hidden;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
@ -19,12 +20,9 @@ import java.util.List;
import java.util.Map;
/**
* Created by IntelliJ IDEA.
* User: delangel
* Date: 6/29/11
* Time: 3:14 PM
* To change this template use File | Settings | File Templates.
* Counts of bases from SLX, 454, and SOLiD at this site
*/
@Hidden
public class TechnologyComposition extends InfoFieldAnnotation implements ExperimentalAnnotation {
private String nSLX = "NumSLX";
private String n454 ="Num454";

View File

@ -48,27 +48,12 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
// code for testing purposes
//
private final static boolean DEBUG = false;
private final static boolean PRINT_LIKELIHOODS = false;
private final static int N_CYCLES = 1;
private SimpleTimer timerExpt = new SimpleTimer("linearExactBanded");
private SimpleTimer timerGS = new SimpleTimer("linearExactGS");
private final static boolean COMPARE_TO_GS = false;
public enum ExactCalculation {
N2_GOLD_STANDARD,
LINEAR_EXPERIMENTAL
}
private final static double MAX_LOG10_ERROR_TO_STOP_EARLY = 6; // we want the calculation to be accurate to 1 / 10^6
private boolean SIMPLE_GREEDY_GENOTYPER = false;
private final boolean SIMPLE_GREEDY_GENOTYPER = false;
private final static double SUM_GL_THRESH_NOCALL = -0.001; // if sum(gl) is bigger than this threshold, we treat GL's as non-informative and will force a no-call.
final private ExactCalculation calcToUse;
protected ExactAFCalculationModel(UnifiedArgumentCollection UAC, int N, Logger logger, PrintStream verboseWriter) {
super(UAC, N, logger, verboseWriter);
calcToUse = UAC.EXACT_CALCULATION_TYPE;
}
public void getLog10PNonRef(RefMetaDataTracker tracker,
@ -76,43 +61,12 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
Map<String, Genotype> GLs, Set<Allele>alleles,
double[] log10AlleleFrequencyPriors,
double[] log10AlleleFrequencyPosteriors) {
// todo -- REMOVE ME AFTER TESTING
// todo -- REMOVE ME AFTER TESTING
// todo -- REMOVE ME AFTER TESTING
double[] gsPosteriors;
if ( COMPARE_TO_GS ) // due to annoying special values in incoming array, we have to clone up here
gsPosteriors = log10AlleleFrequencyPosteriors.clone();
int idxAA = GenotypeType.AA.ordinal();
int idxAB = GenotypeType.AB.ordinal();
int idxBB = GenotypeType.BB.ordinal();
// todo -- remove me after testing
if ( N_CYCLES > 1 ) {
for ( int i = 0; i < N_CYCLES; i++) {
timerGS.restart();
linearExact(GLs, log10AlleleFrequencyPriors, log10AlleleFrequencyPosteriors.clone(), idxAA, idxAB, idxBB);
timerGS.stop();
timerExpt.restart();
linearExactBanded(GLs, log10AlleleFrequencyPriors, log10AlleleFrequencyPosteriors.clone());
timerExpt.stop();
}
System.out.printf("good = %.2f, expt = %.2f, delta = %.2f%n",
timerGS.getElapsedTime(), timerExpt.getElapsedTime(), timerExpt.getElapsedTime()-timerGS.getElapsedTime());
}
int lastK = -1;
int numAlleles = alleles.size();
final int numAlleles = alleles.size();
final double[][] posteriorCache = numAlleles > 2 ? new double[numAlleles-1][] : null;
final double[] bestAFguess = numAlleles > 2 ? new double[numAlleles-1] : null;
int idxDiag = numAlleles;
int incr = numAlleles - 1;
double[][] posteriorCache = new double[numAlleles-1][];
double[] bestAFguess = new double[numAlleles-1];
for (int k=1; k < numAlleles; k++) {
// multi-allelic approximation, part 1: Ideally
// for each alt allele compute marginal (suboptimal) posteriors -
@ -121,24 +75,17 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
// So, for example, with 2 alt alleles, likelihoods have AA,AB,AC,BB,BC,CC.
// 3 alt alleles: AA,AB,AC,AD BB BC BD CC CD DD
idxAA = 0;
idxAB = k;
final int idxAA = 0;
final int idxAB = k;
// yy is always element on the diagonal.
// 2 alleles: BBelement 2
// 3 alleles: BB element 3. CC element 5
// 4 alleles:
idxBB = idxDiag;
final int idxBB = idxDiag;
idxDiag += incr--;
// todo - possible cleanup
switch ( calcToUse ) {
case N2_GOLD_STANDARD:
lastK = gdaN2GoldStandard(GLs, log10AlleleFrequencyPriors, log10AlleleFrequencyPosteriors, idxAA, idxAB, idxBB);
break;
case LINEAR_EXPERIMENTAL:
lastK = linearExact(GLs, log10AlleleFrequencyPriors, log10AlleleFrequencyPosteriors, idxAA, idxAB, idxBB);
break;
}
final int lastK = linearExact(GLs, log10AlleleFrequencyPriors, log10AlleleFrequencyPosteriors, idxAA, idxAB, idxBB);
if (numAlleles > 2) {
posteriorCache[k-1] = log10AlleleFrequencyPosteriors.clone();
bestAFguess[k-1] = (double)MathUtils.maxElementIndex(log10AlleleFrequencyPosteriors);
@ -153,39 +100,14 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
log10AlleleFrequencyPosteriors[k] = (posteriorCache[mostLikelyAlleleIdx][k]);
}
// todo -- REMOVE ME AFTER TESTING
// todo -- REMOVE ME AFTER TESTING
// todo -- REMOVE ME AFTER TESTING
if ( COMPARE_TO_GS ) {
gdaN2GoldStandard(GLs, log10AlleleFrequencyPriors, gsPosteriors, idxAA, idxAB, idxBB);
double log10thisPVar = Math.log10(MathUtils.normalizeFromLog10(log10AlleleFrequencyPosteriors)[0]);
double log10gsPVar = Math.log10(MathUtils.normalizeFromLog10(gsPosteriors)[0]);
boolean eq = (log10thisPVar == Double.NEGATIVE_INFINITY && log10gsPVar == Double.NEGATIVE_INFINITY) || MathUtils.compareDoubles(log10thisPVar, log10gsPVar, 1e-4) == 0;
if ( ! eq || PRINT_LIKELIHOODS ) {
System.out.printf("----------------------------------------%n");
for (int k=0; k < log10AlleleFrequencyPosteriors.length; k++) {
double x = log10AlleleFrequencyPosteriors[k];
System.out.printf(" %d\t%.2f\t%.2f\t%b%n", k,
x < -1e10 ? Double.NEGATIVE_INFINITY : x, gsPosteriors[k],
log10AlleleFrequencyPosteriors[k] == gsPosteriors[k]);
}
System.out.printf("MAD_AC\t%d\t%d\t%.2f\t%.2f\t%.6f%n",
ref.getLocus().getStart(), lastK, log10thisPVar, log10gsPVar, log10thisPVar - log10gsPVar);
}
}
}
private static final ArrayList<double[]> getGLs(Map<String, Genotype> GLs) {
ArrayList<double[]> genotypeLikelihoods = new ArrayList<double[]>();
//int j = 0;
genotypeLikelihoods.add(new double[]{0.0,0.0,0.0}); // dummy
for ( Genotype sample : GLs.values() ) {
if ( sample.hasLikelihoods() ) {
//double[] genotypeLikelihoods = MathUtils.normalizeFromLog10(GLs.get(sample).getLikelihoods());
double[] gls = sample.getLikelihoods().getAsVector();
if (MathUtils.sum(gls) < SUM_GL_THRESH_NOCALL)
@ -240,84 +162,6 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
}
}
// now with banding
public int linearExactBanded(Map<String, Genotype> GLs,
double[] log10AlleleFrequencyPriors,
double[] log10AlleleFrequencyPosteriors) {
throw new NotImplementedException();
// final int numSamples = GLs.size();
// final int numChr = 2*numSamples;
// final double[][] genotypeLikelihoods = getGLs(GLs);
//
// final ExactACCache logY = new ExactACCache(numSamples+1);
// logY.getkMinus0()[0] = 0.0; // the zero case
//
// double maxLog10L = Double.NEGATIVE_INFINITY;
// boolean done = false;
// int lastK = -1;
// final int BAND_SIZE = 10;
//
// for (int k=0; k <= numChr && ! done; k++ ) {
// final double[] kMinus0 = logY.getkMinus0();
// int jStart = Math.max(k - BAND_SIZE, 1);
// int jStop = Math.min(k + BAND_SIZE, numSamples);
//
// if ( k == 0 ) { // special case for k = 0
// for ( int j=1; j <= numSamples; j++ ) {
// kMinus0[j] = kMinus0[j-1] + genotypeLikelihoods[j][GenotypeType.AA.ordinal()];
// }
// } else { // k > 0
// final double[] kMinus1 = logY.getkMinus1();
// final double[] kMinus2 = logY.getkMinus2();
// Arrays.fill(kMinus0,0);
//
// for ( int j = jStart; j <= jStop; j++ ) {
// final double[] gl = genotypeLikelihoods[j];
// final double logDenominator = log10Cache[2*j] + log10Cache[2*j-1];
//
// double aa = Double.NEGATIVE_INFINITY;
// double ab = Double.NEGATIVE_INFINITY;
// if (k < 2*j-1)
// aa = log10Cache[2*j-k] + log10Cache[2*j-k-1] + kMinus0[j-1] + gl[GenotypeType.AA.ordinal()];
//
// if (k < 2*j)
// ab = log10Cache[2*k] + log10Cache[2*j-k]+ kMinus1[j-1] + gl[GenotypeType.AB.ordinal()];
//
// double log10Max;
// if (k > 1) {
// final double bb = log10Cache[k] + log10Cache[k-1] + kMinus2[j-1] + gl[GenotypeType.BB.ordinal()];
// log10Max = approximateLog10SumLog10(aa, ab, bb);
// } else {
// // we know we aren't considering the BB case, so we can use an optimized log10 function
// log10Max = approximateLog10SumLog10(aa, ab);
// }
//
// // finally, update the L(j,k) value
// kMinus0[j] = log10Max - logDenominator;
//
// String offset = Utils.dupString(' ',k);
// System.out.printf("%s%3d %3d %.2f%n", offset, k, j, kMinus0[j]);
// }
// }
//
// // update the posteriors vector
// final double log10LofK = kMinus0[jStop];
// log10AlleleFrequencyPosteriors[k] = log10LofK + log10AlleleFrequencyPriors[k];
//
// // can we abort early?
// lastK = k;
// maxLog10L = Math.max(maxLog10L, log10LofK);
// if ( log10LofK < maxLog10L - MAX_LOG10_ERROR_TO_STOP_EARLY ) {
// if ( DEBUG ) System.out.printf(" *** breaking early k=%d log10L=%.2f maxLog10L=%.2f%n", k, log10LofK, maxLog10L);
// done = true;
// }
//
// logY.rotate();
// }
//
// return lastK;
}
public int linearExact(Map<String, Genotype> GLs,
double[] log10AlleleFrequencyPriors,
double[] log10AlleleFrequencyPosteriors, int idxAA, int idxAB, int idxBB) {
@ -605,82 +449,6 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
return calls;
}
// -------------------------------------------------------------------------------------
//
// Gold standard, but O(N^2), implementation.
//
// TODO -- remove me for clarity in this code
//
// -------------------------------------------------------------------------------------
public int gdaN2GoldStandard(Map<String, Genotype> GLs,
double[] log10AlleleFrequencyPriors,
double[] log10AlleleFrequencyPosteriors, int idxAA, int idxAB, int idxBB) {
int numSamples = GLs.size();
int numChr = 2*numSamples;
double[][] logYMatrix = new double[1+numSamples][1+numChr];
for (int i=0; i <=numSamples; i++)
for (int j=0; j <=numChr; j++)
logYMatrix[i][j] = Double.NEGATIVE_INFINITY;
//YMatrix[0][0] = 1.0;
logYMatrix[0][0] = 0.0;
int j=0;
for ( Map.Entry<String, Genotype> sample : GLs.entrySet() ) {
j++;
if ( !sample.getValue().hasLikelihoods() )
continue;
//double[] genotypeLikelihoods = MathUtils.normalizeFromLog10(GLs.get(sample).getLikelihoods());
double[] genotypeLikelihoods = sample.getValue().getLikelihoods().getAsVector();
//double logDenominator = Math.log10(2.0*j*(2.0*j-1));
double logDenominator = MathUtils.log10Cache[2*j] + MathUtils.log10Cache[2*j-1];
// special treatment for k=0: iteration reduces to:
//YMatrix[j][0] = YMatrix[j-1][0]*genotypeLikelihoods[GenotypeType.AA.ordinal()];
logYMatrix[j][0] = logYMatrix[j-1][0] + genotypeLikelihoods[idxAA];
for (int k=1; k <= 2*j; k++ ) {
//double num = (2.0*j-k)*(2.0*j-k-1)*YMatrix[j-1][k] * genotypeLikelihoods[GenotypeType.AA.ordinal()];
double logNumerator[];
logNumerator = new double[3];
if (k < 2*j-1)
logNumerator[0] = MathUtils.log10Cache[2*j-k] + MathUtils.log10Cache[2*j-k-1] + logYMatrix[j-1][k] +
genotypeLikelihoods[idxAA];
else
logNumerator[0] = Double.NEGATIVE_INFINITY;
if (k < 2*j)
logNumerator[1] = MathUtils.log10Cache[2*k] + MathUtils.log10Cache[2*j-k]+ logYMatrix[j-1][k-1] +
genotypeLikelihoods[idxAB];
else
logNumerator[1] = Double.NEGATIVE_INFINITY;
if (k > 1)
logNumerator[2] = MathUtils.log10Cache[k] + MathUtils.log10Cache[k-1] + logYMatrix[j-1][k-2] +
genotypeLikelihoods[idxBB];
else
logNumerator[2] = Double.NEGATIVE_INFINITY;
double logNum = MathUtils.softMax(logNumerator);
//YMatrix[j][k] = num/den;
logYMatrix[j][k] = logNum - logDenominator;
}
}
for (int k=0; k <= numChr; k++)
log10AlleleFrequencyPosteriors[k] = logYMatrix[j][k] + log10AlleleFrequencyPriors[k];
return numChr;
}
private final static void printLikelihoods(int numChr, double[][] logYMatrix, double[] log10AlleleFrequencyPriors) {
int j = logYMatrix.length - 1;
System.out.printf("-----------------------------------%n");
@ -689,5 +457,4 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
System.out.printf(" %4d\t%8.2f\t%8.2f\t%8.2f%n", k, logYMatrix[j][k], log10AlleleFrequencyPriors[k], posterior);
}
}
}

View File

@ -168,10 +168,6 @@ public class UnifiedArgumentCollection {
@Argument(fullName = "GSA_PRODUCTION_ONLY", shortName = "GSA_PRODUCTION_ONLY", doc = "don't ever use me", required = false)
public boolean GSA_PRODUCTION_ONLY = false;
@Hidden
@Argument(fullName = "exactCalculation", shortName = "exactCalculation", doc = "expt", required = false)
public ExactAFCalculationModel.ExactCalculation EXACT_CALCULATION_TYPE = ExactAFCalculationModel.ExactCalculation.LINEAR_EXPERIMENTAL;
@Hidden
@Argument(fullName = "ignoreSNPAlleles", shortName = "ignoreSNPAlleles", doc = "expt", required = false)
public boolean IGNORE_SNP_ALLELES = false;
@ -191,7 +187,6 @@ public class UnifiedArgumentCollection {
uac.GLmodel = GLmodel;
uac.AFmodel = AFmodel;
uac.EXACT_CALCULATION_TYPE = EXACT_CALCULATION_TYPE;
uac.heterozygosity = heterozygosity;
uac.PCR_error = PCR_error;
uac.GenotypingMode = GenotypingMode;

View File

@ -42,6 +42,7 @@ import java.util.*;
*
* <p>Body test</p>
*/
@Hidden
public class DocumentationTest extends RodWalker<Integer, Integer> {
// the docs for the arguments are in the collection
@ArgumentCollection protected StandardVariantContextInputArgumentCollection variantCollection = new StandardVariantContextInputArgumentCollection();

View File

@ -432,25 +432,37 @@ public class ClippingOp {
}
private int calculateAlignmentStartShift(Cigar oldCigar, Cigar newCigar) {
int shift = 0;
int newShift = 0;
int oldShift = 0;
int deletionShift = 0;
// Rewind to previous start (by counting everything that was already clipped in this read)
for (CigarElement cigarElement : oldCigar.getCigarElements()) {
if (!cigarElement.getOperator().consumesReferenceBases())
shift -= cigarElement.getLength();
else
break;
}
// Advance to new start (by counting everything new that has been clipped )
for (CigarElement cigarElement : newCigar.getCigarElements()) {
if (!cigarElement.getOperator().consumesReferenceBases())
shift += cigarElement.getLength();
if (cigarElement.getOperator() == CigarOperator.HARD_CLIP || cigarElement.getOperator() == CigarOperator.SOFT_CLIP)
newShift += cigarElement.getLength();
else
break;
}
return shift;
for (CigarElement cigarElement : oldCigar.getCigarElements()) {
if (cigarElement.getOperator() == CigarOperator.HARD_CLIP || cigarElement.getOperator() == CigarOperator.SOFT_CLIP )
oldShift += Math.min(cigarElement.getLength(), newShift - oldShift);
else
break;
}
int basesClipped = 0;
for (CigarElement cigarElement : oldCigar.getCigarElements()) {
if (basesClipped > newShift) // are we beyond the clipped region?
break;
else if (cigarElement.getOperator() == CigarOperator.DELETION) // if this is a deletion, we have to adjust the starting shift
deletionShift += cigarElement.getLength();
else
basesClipped += cigarElement.getLength();
}
return newShift - oldShift + deletionShift;
}
private int calculateHardClippingAlignmentShift(CigarElement cigarElement, int clippedLength) {
@ -462,8 +474,8 @@ public class ClippingOp {
return -clippedLength;
}
if (cigarElement.getOperator() == CigarOperator.DELETION)
return cigarElement.getLength();
// if (cigarElement.getOperator() == CigarOperator.DELETION)
// return cigarElement.getLength();
return 0;
}

View File

@ -71,21 +71,21 @@ public class ReadClipper {
private SAMRecord hardClipByReferenceCoordinates(int refStart, int refStop) {
int start = (refStart < 0) ? 0 : ReadUtils.getReadCoordinateForReferenceCoordinate(read, refStart);
int stop = (refStop < 0) ? read.getReadLength() - 1: ReadUtils.getReadCoordinateForReferenceCoordinate(read, refStop);
int stop = (refStop < 0) ? read.getReadLength() - 1 : ReadUtils.getReadCoordinateForReferenceCoordinate(read, refStop);
if (start < 0 || stop > read.getReadLength() - 1 + numDeletions(read))
if (start < 0 || stop > read.getReadLength() - 1)
throw new ReviewedStingException("Trying to clip before the start or after the end of a read");
// TODO add check in the Hardclip function
if ( start > stop )
stop = ReadUtils.getReadCoordinateForReferenceCoordinate(read, ReadUtils.getRefCoordSoftUnclippedEnd(read));
if ( start > stop ) {
// stop = ReadUtils.getReadCoordinateForReferenceCoordinate(read, ReadUtils.getRefCoordSoftUnclippedEnd(read));
throw new ReviewedStingException("START > STOP -- this should never happen -- call Mauricio!");
}
//This tries to fix the bug where the deletion is counted a read base and as a result, the hardCLipper runs into
//an endless loop when hard clipping the cigar string because the read coordinates are not covered by the read
stop -= numDeletions(read);
if ( start > stop )
start -= numDeletions(read);
// stop -= numDeletions(read);
// if ( start > stop )
// start -= numDeletions(read);
//System.out.println("Clipping start/stop: " + start + "/" + stop);
@ -145,7 +145,7 @@ public class ReadClipper {
cutLeft = readIndex + cigarElement.getLength() - 1;
}
}
else
else if (cigarElement.getOperator() != CigarOperator.HARD_CLIP)
rightTail = true;
if (cigarElement.getOperator().consumesReadBases())

View File

@ -66,7 +66,8 @@ public class GenericDocumentationHandler extends DocumentedGATKFeatureHandler {
public boolean includeInDocs(ClassDoc doc) {
try {
Class type = HelpUtils.getClassForDoc(doc);
return JVMUtils.isConcrete(type);
boolean hidden = ! getDoclet().showHiddenFeatures() && type.isAnnotationPresent(Hidden.class);
return ! hidden && JVMUtils.isConcrete(type);
} catch ( ClassNotFoundException e ) {
return false;
}

View File

@ -74,13 +74,6 @@ public class Genotype {
return new Genotype(g.getSampleName(), g.getAlleles(), g.getNegLog10PError(), g.filtersWereApplied() ? g.getFilters() : null, attributes, g.isPhased());
}
public static Genotype removePLs(Genotype g) {
Map<String, Object> attrs = new HashMap<String, Object>(g.getAttributes());
attrs.remove(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY);
attrs.remove(VCFConstants.GENOTYPE_LIKELIHOODS_KEY);
return new Genotype(g.getSampleName(), g.getAlleles(), g.getNegLog10PError(), g.filtersWereApplied() ? g.getFilters() : null, attrs, g.isPhased());
}
public static Genotype modifyAlleles(Genotype g, List<Allele> alleles) {
return new Genotype(g.getSampleName(), alleles, g.getNegLog10PError(), g.filtersWereApplied() ? g.getFilters() : null, g.getAttributes(), g.isPhased());
}

View File

@ -44,6 +44,11 @@ import java.io.Serializable;
import java.util.*;
public class VariantContextUtils {
public final static String MERGE_INTERSECTION = "Intersection";
public final static String MERGE_FILTER_IN_ALL = "FilteredInAll";
public final static String MERGE_REF_IN_ALL = "ReferenceInAll";
public final static String MERGE_FILTER_PREFIX = "filterIn";
final public static JexlEngine engine = new JexlEngine();
static {
engine.setSilent(false); // will throw errors now for selects that don't evaluate properly
@ -154,6 +159,13 @@ public class VariantContextUtils {
return "%." + precision + "f";
}
public static Genotype removePLs(Genotype g) {
Map<String, Object> attrs = new HashMap<String, Object>(g.getAttributes());
attrs.remove(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY);
attrs.remove(VCFConstants.GENOTYPE_LIKELIHOODS_KEY);
return new Genotype(g.getSampleName(), g.getAlleles(), g.getNegLog10PError(), g.filtersWereApplied() ? g.getFilters() : null, attrs, g.isPhased());
}
/**
* A simple but common wrapper for matching VariantContext objects using JEXL expressions
*/
@ -609,19 +621,20 @@ public class VariantContextUtils {
if ( filteredRecordMergeType == FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED && nFiltered != VCs.size() )
filters.clear();
if ( annotateOrigin ) { // we care about where the call came from
String setValue;
if ( nFiltered == 0 && variantSources.size() == priorityListOfVCs.size() ) // nothing was unfiltered
setValue = "Intersection";
setValue = MERGE_INTERSECTION;
else if ( nFiltered == VCs.size() ) // everything was filtered out
setValue = "FilteredInAll";
setValue = MERGE_FILTER_IN_ALL;
else if ( variantSources.isEmpty() ) // everyone was reference
setValue = "ReferenceInAll";
setValue = MERGE_REF_IN_ALL;
else {
LinkedHashSet<String> s = new LinkedHashSet<String>();
for ( VariantContext vc : VCs )
if ( vc.isVariant() )
s.add( vc.isFiltered() ? "filterIn" + vc.getSource() : vc.getSource() );
s.add( vc.isFiltered() ? MERGE_FILTER_PREFIX + vc.getSource() : vc.getSource() );
setValue = Utils.join("-", s);
}
@ -734,7 +747,7 @@ public class VariantContextUtils {
Map<String, Genotype> newGs = new HashMap<String, Genotype>(genotypes.size());
for ( Map.Entry<String, Genotype> g : genotypes.entrySet() ) {
newGs.put(g.getKey(), g.getValue().hasLikelihoods() ? Genotype.removePLs(g.getValue()) : g.getValue());
newGs.put(g.getKey(), g.getValue().hasLikelihoods() ? removePLs(g.getValue()) : g.getValue());
}
return newGs;

View File

@ -132,15 +132,21 @@ public abstract class BaseTest {
*/
public static class TestDataProvider {
private static final Map<Class, List<Object>> tests = new HashMap<Class, List<Object>>();
private final String name;
/**
* Create a new TestDataProvider instance bound to the class variable C
* @param c
*/
public TestDataProvider(Class c) {
public TestDataProvider(Class c, String name) {
if ( ! tests.containsKey(c) )
tests.put(c, new ArrayList<Object>());
tests.get(c).add(this);
this.name = name;
}
public TestDataProvider(Class c) {
this(c, "");
}
/**
@ -153,6 +159,11 @@ public abstract class BaseTest {
for ( Object x : tests.get(c) ) params2.add(new Object[]{x});
return params2.toArray(new Object[][]{});
}
@Override
public String toString() {
return "TestDataProvider("+name+")";
}
}
/**

View File

@ -33,7 +33,7 @@ public class SymbolicAllelesIntegrationTest extends WalkerTest {
WalkerTestSpec spec = new WalkerTestSpec(
baseTestString(b36KGReference, "symbolic_alleles_2.vcf"),
1,
Arrays.asList("6645babc8c7d46be0da223477c7b1291"));
Arrays.asList("3008d6f5044bc14801e5c58d985dec72"));
executeTest("Test symbolic alleles mixed in with non-symbolic alleles", spec);
}
}

View File

@ -21,37 +21,25 @@
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
// our package
package org.broadinstitute.sting.utils.variantcontext;
// the imports for unit testing.
import net.sf.picard.reference.IndexedFastaSequenceFile;
import org.apache.log4j.Priority;
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.gatk.refdata.tracks.FeatureManager;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile;
import org.testng.Assert;
import org.testng.annotations.BeforeSuite;
import org.testng.annotations.Test;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;
import java.io.File;
import java.io.FileNotFoundException;
import java.util.*;
public class VariantContextUtilsUnitTest extends BaseTest {
Allele Aref, T, delRef, ATC;
Genotype ref1, snp1, snp2, indel1, indelref;
Allele Aref, T, C, delRef, ATC, ATCATC;
private GenomeLocParser genomeLocParser;
VariantContext refVC, snpVC1, snpVC2, snpVC3, snpVC4, indelVC1, indelVC2, indelVC3;
@BeforeSuite
public void setup() {
@ -68,28 +56,35 @@ public class VariantContextUtilsUnitTest extends BaseTest {
Aref = Allele.create("A", true);
delRef = Allele.create("-", true);
T = Allele.create("T");
C = Allele.create("C");
ATC = Allele.create("ATC");
ATCATC = Allele.create("ATCATC");
}
ref1 = new Genotype("ref1", Arrays.asList(Aref, Aref), 5, new double[]{0, 5, 10});
snp1 = new Genotype("snp1", Arrays.asList(Aref,T), 10, new double[]{10, 0, 20});
snp2 = new Genotype("snp2", Arrays.asList(T,T), 15, new double[]{25, 15, 0});
indelref = new Genotype("indelref", Arrays.asList(delRef,delRef), 25, new double[]{0, 25, 30});
indel1 = new Genotype("indel1", Arrays.asList(delRef,ATC), 20, new double[]{20, 0, 30});
private Genotype makeG(String sample, Allele a1, Allele a2) {
return new Genotype(sample, Arrays.asList(a1, a2));
}
refVC = makeVC("refvc", Arrays.asList(Aref), Arrays.asList(ref1));
snpVC1 = makeVC("snpvc1", Arrays.asList(Aref, T), Arrays.asList(snp1));
snpVC2 = makeVC("snpvc2", Arrays.asList(Aref, T), Arrays.asList(snp1, snp2));
snpVC3 = makeVC("snpvc3", Arrays.asList(Aref, T), Arrays.asList(ref1, snp1));
snpVC4 = makeVC("snpvc4", Arrays.asList(Aref, T), Arrays.asList(ref1, snp1, snp2));
indelVC1 = makeVC("indelvc1", Arrays.asList(delRef), Arrays.asList(indelref));
indelVC2 = makeVC("indelvc2", Arrays.asList(delRef, ATC), Arrays.asList(indel1));
indelVC3 = makeVC("indelvc3", Arrays.asList(delRef, ATC), Arrays.asList(indelref, indel1));
private Genotype makeG(String sample, Allele a1, Allele a2, double log10pError) {
return new Genotype(sample, Arrays.asList(a1, a2), log10pError);
}
private VariantContext makeVC(String source, List<Allele> alleles) {
return makeVC(source, alleles, null, null);
}
private VariantContext makeVC(String source, List<Allele> alleles, Genotype... g1) {
return makeVC(source, alleles, Arrays.asList(g1));
}
private VariantContext makeVC(String source, List<Allele> alleles, String filter) {
return makeVC(source, alleles, filter.equals(".") ? null : new HashSet<String>(Arrays.asList(filter)));
}
private VariantContext makeVC(String source, List<Allele> alleles, Set<String> filters) {
return makeVC(source, alleles, null, filters);
}
private VariantContext makeVC(String source, List<Allele> alleles, Collection<Genotype> genotypes) {
return makeVC(source, alleles, genotypes, null);
}
@ -98,45 +93,108 @@ public class VariantContextUtilsUnitTest extends BaseTest {
int start = 10;
int stop = start; // alleles.contains(ATC) ? start + 3 : start;
return new VariantContext(source, "1", start, stop, alleles,
VariantContext.genotypeCollectionToMap(new TreeMap<String, Genotype>(), genotypes),
genotypes == null ? null : VariantContext.genotypeCollectionToMap(new TreeMap<String, Genotype>(), genotypes),
1.0, filters, null, (byte)'C');
}
private class SimpleMergeTest extends TestDataProvider {
List<VariantContext> inputVCs;
VariantContext expectedVC;
// --------------------------------------------------------------------------------
//
// Test allele merging
//
// --------------------------------------------------------------------------------
private SimpleMergeTest(VariantContext... vcsArg) {
super(SimpleMergeTest.class);
LinkedList<VariantContext> allVCs = new LinkedList<VariantContext>(Arrays.asList(vcsArg));
expectedVC = allVCs.pollLast();
inputVCs = allVCs;
private class MergeAllelesTest extends TestDataProvider {
List<List<Allele>> inputs;
List<Allele> expected;
private MergeAllelesTest(List<Allele>... arg) {
super(MergeAllelesTest.class);
LinkedList<List<Allele>> all = new LinkedList<List<Allele>>(Arrays.asList(arg));
expected = all.pollLast();
inputs = all;
}
public String toString() {
return String.format("SimpleMergeTest vc=%s expected=%s", inputVCs, expectedVC);
return String.format("MergeAllelesTest input=%s expected=%s", inputs, expected);
}
}
@DataProvider(name = "simplemergedata")
public Object[][] createSimpleMergeData() {
@DataProvider(name = "mergeAlleles")
public Object[][] mergeAllelesData() {
// first, do no harm
new SimpleMergeTest(refVC, refVC);
new SimpleMergeTest(snpVC1, snpVC1);
new SimpleMergeTest(indelVC1, indelVC1);
new SimpleMergeTest(indelVC3, indelVC3);
new MergeAllelesTest(Arrays.asList(Aref),
Arrays.asList(Aref));
new SimpleMergeTest(refVC, snpVC1, snpVC3);
new SimpleMergeTest(snpVC1, snpVC2, snpVC2);
new SimpleMergeTest(refVC, snpVC2, snpVC4);
new MergeAllelesTest(Arrays.asList(Aref),
Arrays.asList(Aref),
Arrays.asList(Aref));
new SimpleMergeTest(indelVC1, indelVC2, indelVC3);
new SimpleMergeTest(indelVC1, indelVC3, indelVC3);
new SimpleMergeTest(indelVC2, indelVC3, indelVC3);
new MergeAllelesTest(Arrays.asList(Aref),
Arrays.asList(Aref, T),
Arrays.asList(Aref, T));
return SimpleMergeTest.getTests(SimpleMergeTest.class);
new MergeAllelesTest(Arrays.asList(Aref, C),
Arrays.asList(Aref, T),
Arrays.asList(Aref, C, T));
new MergeAllelesTest(Arrays.asList(Aref, T),
Arrays.asList(Aref, C),
Arrays.asList(Aref, C, T)); // sorted by allele
new MergeAllelesTest(Arrays.asList(Aref, C, T),
Arrays.asList(Aref, C),
Arrays.asList(Aref, C, T));
new MergeAllelesTest(Arrays.asList(Aref, T, C),
Arrays.asList(Aref, C),
Arrays.asList(Aref, C, T)); // sorted by allele
new MergeAllelesTest(Arrays.asList(delRef),
Arrays.asList(delRef)); // todo -- FIXME me GdA
new MergeAllelesTest(Arrays.asList(delRef),
Arrays.asList(delRef, ATC),
Arrays.asList(delRef, ATC));
new MergeAllelesTest(Arrays.asList(delRef),
Arrays.asList(delRef, ATC, ATCATC),
Arrays.asList(delRef, ATC, ATCATC));
new MergeAllelesTest(Arrays.asList(delRef, ATCATC),
Arrays.asList(delRef, ATC, ATCATC),
Arrays.asList(delRef, ATC, ATCATC));
new MergeAllelesTest(Arrays.asList(delRef, ATC),
Arrays.asList(delRef, ATCATC),
Arrays.asList(delRef, ATC, ATCATC));
return MergeAllelesTest.getTests(MergeAllelesTest.class);
}
@Test(dataProvider = "mergeAlleles")
public void testMergeAlleles(MergeAllelesTest cfg) {
final List<VariantContext> inputs = new ArrayList<VariantContext>();
int i = 0;
for ( final List<Allele> alleles : cfg.inputs ) {
final String name = "vcf" + ++i;
inputs.add(makeVC(name, alleles));
}
final List<String> priority = vcs2priority(inputs);
final VariantContext merged = VariantContextUtils.simpleMerge(genomeLocParser,
inputs, priority,
VariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED,
VariantContextUtils.GenotypeMergeType.PRIORITIZE, false, false, "set", false, false);
Assert.assertEquals(merged.getAlleles(), cfg.expected);
}
// --------------------------------------------------------------------------------
//
// Test rsID merging
//
// --------------------------------------------------------------------------------
private class SimpleMergeRSIDTest extends TestDataProvider {
List<String> inputs;
String expected;
@ -156,10 +214,13 @@ public class VariantContextUtilsUnitTest extends BaseTest {
@DataProvider(name = "simplemergersiddata")
public Object[][] createSimpleMergeRSIDData() {
new SimpleMergeRSIDTest(".", ".");
new SimpleMergeRSIDTest(".", ".", ".");
new SimpleMergeRSIDTest("rs1", "rs1");
new SimpleMergeRSIDTest("rs1", "rs1", "rs1");
new SimpleMergeRSIDTest(".", "rs1", "rs1");
new SimpleMergeRSIDTest("rs1", ".", "rs1");
new SimpleMergeRSIDTest("rs1", "rs2", "rs1,rs2");
new SimpleMergeRSIDTest("rs1", "rs2", "rs1", "rs1,rs2"); // duplicates
new SimpleMergeRSIDTest("rs2", "rs1", "rs2,rs1");
new SimpleMergeRSIDTest("rs2", "rs1", ".", "rs2,rs1");
new SimpleMergeRSIDTest("rs2", ".", "rs1", "rs2,rs1");
@ -171,32 +232,312 @@ public class VariantContextUtilsUnitTest extends BaseTest {
@Test(dataProvider = "simplemergersiddata")
public void testRSIDMerge(SimpleMergeRSIDTest cfg) {
List<VariantContext> inputs = new ArrayList<VariantContext>();
for ( String id : cfg.inputs ) {
final VariantContext snpVC1 = makeVC("snpvc1", Arrays.asList(Aref, T));
final List<VariantContext> inputs = new ArrayList<VariantContext>();
for ( final String id : cfg.inputs ) {
MutableVariantContext vc = new MutableVariantContext(snpVC1);
if ( ! id.equals(".") ) vc.setID(id);
inputs.add(vc);
}
VariantContext merged = myMerge(inputs);
final VariantContext merged = VariantContextUtils.simpleMerge(genomeLocParser,
inputs, null,
VariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED,
VariantContextUtils.GenotypeMergeType.UNSORTED, false, false, "set", false, false);
Assert.assertEquals(merged.getID(), cfg.expected.equals(".") ? null : cfg.expected);
}
private VariantContext myMerge(List<VariantContext> inputs) {
List<String> priority = new ArrayList<String>();
for ( VariantContext vc : inputs ) priority.add(vc.getSource());
// --------------------------------------------------------------------------------
//
// Test filtered merging
//
// --------------------------------------------------------------------------------
return VariantContextUtils.simpleMerge(genomeLocParser,
inputs, priority,
VariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED,
VariantContextUtils.GenotypeMergeType.PRIORITIZE, true, false, "set", false, false);
private class MergeFilteredTest extends TestDataProvider {
List<VariantContext> inputs;
VariantContext expected;
String setExpected;
VariantContextUtils.FilteredRecordMergeType type;
private MergeFilteredTest(String name, VariantContext input1, VariantContext input2, VariantContext expected, String setExpected) {
this(name, input1, input2, expected, VariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, setExpected);
}
private MergeFilteredTest(String name, VariantContext input1, VariantContext input2, VariantContext expected, VariantContextUtils.FilteredRecordMergeType type, String setExpected) {
super(MergeFilteredTest.class, name);
LinkedList<VariantContext> all = new LinkedList<VariantContext>(Arrays.asList(input1, input2));
this.expected = expected;
this.type = type;
inputs = all;
this.setExpected = setExpected;
}
public String toString() {
return String.format("%s input=%s expected=%s", super.toString(), inputs, expected);
}
}
// todo -- add tests for subset merging, especially with correct PLs
// todo -- test priority list
// todo -- test FilteredRecordMergeType
// todo -- no annotate origin
// todo -- test set key
// todo -- test filtered are uncalled
@DataProvider(name = "mergeFiltered")
public Object[][] mergeFilteredData() {
new MergeFilteredTest("AllPass",
makeVC("1", Arrays.asList(Aref, T), VariantContext.PASSES_FILTERS),
makeVC("2", Arrays.asList(Aref, T), VariantContext.PASSES_FILTERS),
makeVC("3", Arrays.asList(Aref, T), VariantContext.PASSES_FILTERS),
VariantContextUtils.MERGE_INTERSECTION);
new MergeFilteredTest("noFilters",
makeVC("1", Arrays.asList(Aref, T), "."),
makeVC("2", Arrays.asList(Aref, T), "."),
makeVC("3", Arrays.asList(Aref, T), "."),
VariantContextUtils.MERGE_INTERSECTION);
new MergeFilteredTest("oneFiltered",
makeVC("1", Arrays.asList(Aref, T), "."),
makeVC("2", Arrays.asList(Aref, T), "FAIL"),
makeVC("3", Arrays.asList(Aref, T), "."),
String.format("1-%s2", VariantContextUtils.MERGE_FILTER_PREFIX));
new MergeFilteredTest("onePassOneFail",
makeVC("1", Arrays.asList(Aref, T), VariantContext.PASSES_FILTERS),
makeVC("2", Arrays.asList(Aref, T), "FAIL"),
makeVC("3", Arrays.asList(Aref, T), VariantContext.PASSES_FILTERS),
String.format("1-%s2", VariantContextUtils.MERGE_FILTER_PREFIX));
new MergeFilteredTest("AllFiltered",
makeVC("1", Arrays.asList(Aref, T), "FAIL"),
makeVC("2", Arrays.asList(Aref, T), "FAIL"),
makeVC("3", Arrays.asList(Aref, T), "FAIL"),
VariantContextUtils.MERGE_FILTER_IN_ALL);
// test ALL vs. ANY
new MergeFilteredTest("FailOneUnfiltered",
makeVC("1", Arrays.asList(Aref, T), "FAIL"),
makeVC("2", Arrays.asList(Aref, T), "."),
makeVC("3", Arrays.asList(Aref, T), "."),
VariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED,
String.format("%s1-2", VariantContextUtils.MERGE_FILTER_PREFIX));
new MergeFilteredTest("OneFailAllUnfilteredArg",
makeVC("1", Arrays.asList(Aref, T), "FAIL"),
makeVC("2", Arrays.asList(Aref, T), "."),
makeVC("3", Arrays.asList(Aref, T), "FAIL"),
VariantContextUtils.FilteredRecordMergeType.KEEP_IF_ALL_UNFILTERED,
String.format("%s1-2", VariantContextUtils.MERGE_FILTER_PREFIX));
// test excluding allele in filtered record
new MergeFilteredTest("DontIncludeAlleleOfFilteredRecords",
makeVC("1", Arrays.asList(Aref, T), "."),
makeVC("2", Arrays.asList(Aref, T), "FAIL"),
makeVC("3", Arrays.asList(Aref, T), "."),
String.format("1-%s2", VariantContextUtils.MERGE_FILTER_PREFIX));
// promotion of site from unfiltered to PASSES
new MergeFilteredTest("UnfilteredPlusPassIsPass",
makeVC("1", Arrays.asList(Aref, T), "."),
makeVC("2", Arrays.asList(Aref, T), VariantContext.PASSES_FILTERS),
makeVC("3", Arrays.asList(Aref, T), VariantContext.PASSES_FILTERS),
VariantContextUtils.MERGE_INTERSECTION);
new MergeFilteredTest("RefInAll",
makeVC("1", Arrays.asList(Aref), VariantContext.PASSES_FILTERS),
makeVC("2", Arrays.asList(Aref), VariantContext.PASSES_FILTERS),
makeVC("3", Arrays.asList(Aref), VariantContext.PASSES_FILTERS),
VariantContextUtils.MERGE_REF_IN_ALL);
new MergeFilteredTest("RefInOne",
makeVC("1", Arrays.asList(Aref), VariantContext.PASSES_FILTERS),
makeVC("2", Arrays.asList(Aref, T), VariantContext.PASSES_FILTERS),
makeVC("3", Arrays.asList(Aref, T), VariantContext.PASSES_FILTERS),
"2");
return MergeFilteredTest.getTests(MergeFilteredTest.class);
}
@Test(dataProvider = "mergeFiltered")
public void testMergeFiltered(MergeFilteredTest cfg) {
final List<String> priority = vcs2priority(cfg.inputs);
final VariantContext merged = VariantContextUtils.simpleMerge(genomeLocParser,
cfg.inputs, priority, cfg.type, VariantContextUtils.GenotypeMergeType.PRIORITIZE, true, false, "set", false, false);
// test alleles are equal
Assert.assertEquals(merged.getAlleles(), cfg.expected.getAlleles());
// test set field
Assert.assertEquals(merged.getAttribute("set"), cfg.setExpected);
// test filter field
Assert.assertEquals(merged.getFilters(), cfg.expected.getFilters());
}
// --------------------------------------------------------------------------------
//
// Test genotype merging
//
// --------------------------------------------------------------------------------
private class MergeGenotypesTest extends TestDataProvider {
List<VariantContext> inputs;
VariantContext expected;
List<String> priority;
private MergeGenotypesTest(String name, String priority, VariantContext... arg) {
super(MergeGenotypesTest.class, name);
LinkedList<VariantContext> all = new LinkedList<VariantContext>(Arrays.asList(arg));
this.expected = all.pollLast();
inputs = all;
this.priority = Arrays.asList(priority.split(","));
}
public String toString() {
return String.format("%s input=%s expected=%s", super.toString(), inputs, expected);
}
}
@DataProvider(name = "mergeGenotypes")
public Object[][] mergeGenotypesData() {
new MergeGenotypesTest("TakeGenotypeByPriority-1,2", "1,2",
makeVC("1", Arrays.asList(Aref, T), makeG("s1", Aref, T, 1)),
makeVC("2", Arrays.asList(Aref, T), makeG("s1", Aref, T, 2)),
makeVC("3", Arrays.asList(Aref, T), makeG("s1", Aref, T, 1)));
new MergeGenotypesTest("TakeGenotypeByPriority-1,2-nocall", "1,2",
makeVC("1", Arrays.asList(Aref, T), makeG("s1", Allele.NO_CALL, Allele.NO_CALL, 1)),
makeVC("2", Arrays.asList(Aref, T), makeG("s1", Aref, T, 2)),
makeVC("3", Arrays.asList(Aref, T), makeG("s1", Allele.NO_CALL, Allele.NO_CALL, 1)));
new MergeGenotypesTest("TakeGenotypeByPriority-2,1", "2,1",
makeVC("1", Arrays.asList(Aref, T), makeG("s1", Aref, T, 1)),
makeVC("2", Arrays.asList(Aref, T), makeG("s1", Aref, T, 2)),
makeVC("3", Arrays.asList(Aref, T), makeG("s1", Aref, T, 2)));
new MergeGenotypesTest("NonOverlappingGenotypes", "1,2",
makeVC("1", Arrays.asList(Aref, T), makeG("s1", Aref, T, 1)),
makeVC("2", Arrays.asList(Aref, T), makeG("s2", Aref, T, 2)),
makeVC("3", Arrays.asList(Aref, T), makeG("s1", Aref, T, 1), makeG("s2", Aref, T, 2)));
new MergeGenotypesTest("PreserveNoCall", "1,2",
makeVC("1", Arrays.asList(Aref, T), makeG("s1", Allele.NO_CALL, Allele.NO_CALL, 1)),
makeVC("2", Arrays.asList(Aref, T), makeG("s2", Aref, T, 2)),
makeVC("3", Arrays.asList(Aref, T), makeG("s1", Allele.NO_CALL, Allele.NO_CALL, 1), makeG("s2", Aref, T, 2)));
new MergeGenotypesTest("PerserveAlleles", "1,2",
makeVC("1", Arrays.asList(Aref, T), makeG("s1", Aref, T, 1)),
makeVC("2", Arrays.asList(Aref, C), makeG("s2", Aref, C, 2)),
makeVC("3", Arrays.asList(Aref, C, T), makeG("s1", Aref, T, 1), makeG("s2", Aref, C, 2)));
new MergeGenotypesTest("TakeGenotypePartialOverlap-1,2", "1,2",
makeVC("1", Arrays.asList(Aref, T), makeG("s1", Aref, T, 1)),
makeVC("2", Arrays.asList(Aref, T), makeG("s1", Aref, T, 2), makeG("s3", Aref, T, 3)),
makeVC("3", Arrays.asList(Aref, T), makeG("s1", Aref, T, 1), makeG("s3", Aref, T, 3)));
new MergeGenotypesTest("TakeGenotypePartialOverlap-2,1", "2,1",
makeVC("1", Arrays.asList(Aref, T), makeG("s1", Aref, T, 1)),
makeVC("2", Arrays.asList(Aref, T), makeG("s1", Aref, T, 2), makeG("s3", Aref, T, 3)),
makeVC("3", Arrays.asList(Aref, T), makeG("s1", Aref, T, 2), makeG("s3", Aref, T, 3)));
// todo -- GDA -- add tests for merging correct PLs
return MergeGenotypesTest.getTests(MergeGenotypesTest.class);
}
@Test(dataProvider = "mergeGenotypes")
public void testMergeGenotypes(MergeGenotypesTest cfg) {
final VariantContext merged = VariantContextUtils.simpleMerge(genomeLocParser,
cfg.inputs, cfg.priority, VariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED,
VariantContextUtils.GenotypeMergeType.PRIORITIZE, true, false, "set", false, false);
// test alleles are equal
Assert.assertEquals(merged.getAlleles(), cfg.expected.getAlleles());
// test genotypes
assertGenotypesAreMostlyEqual(merged.getGenotypes(), cfg.expected.getGenotypes());
}
// necessary to not overload equals for genotypes
private void assertGenotypesAreMostlyEqual(Map<String, Genotype> actual, Map<String, Genotype> expected) {
if (actual == expected) {
return;
}
if (actual == null || expected == null) {
Assert.fail("Maps not equal: expected: " + expected + " and actual: " + actual);
}
if (actual.size() != expected.size()) {
Assert.fail("Maps do not have the same size:" + actual.size() + " != " + expected.size());
}
for (Map.Entry<String, Genotype> entry : actual.entrySet()) {
String key = entry.getKey();
Genotype value = entry.getValue();
Genotype expectedValue = expected.get(key);
Assert.assertEquals(value.alleles, expectedValue.alleles);
Assert.assertEquals(value.getNegLog10PError(), expectedValue.getNegLog10PError());
Assert.assertEquals(value.hasLikelihoods(), expectedValue.hasLikelihoods());
if ( value.hasLikelihoods() )
Assert.assertEquals(value.getLikelihoods(), expectedValue.getLikelihoods());
}
}
@Test
public void testMergeGenotypesUniquify() {
final VariantContext vc1 = makeVC("1", Arrays.asList(Aref, T), makeG("s1", Aref, T, 1));
final VariantContext vc2 = makeVC("2", Arrays.asList(Aref, T), makeG("s1", Aref, T, 2));
final VariantContext merged = VariantContextUtils.simpleMerge(genomeLocParser,
Arrays.asList(vc1, vc2), null, VariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED,
VariantContextUtils.GenotypeMergeType.UNIQUIFY, false, false, "set", false, false);
// test genotypes
Assert.assertEquals(merged.getGenotypes().keySet(), new HashSet<String>(Arrays.asList("s1.1", "s1.2")));
}
@Test(expectedExceptions = UserException.class)
public void testMergeGenotypesRequireUnique() {
final VariantContext vc1 = makeVC("1", Arrays.asList(Aref, T), makeG("s1", Aref, T, 1));
final VariantContext vc2 = makeVC("2", Arrays.asList(Aref, T), makeG("s1", Aref, T, 2));
final VariantContext merged = VariantContextUtils.simpleMerge(genomeLocParser,
Arrays.asList(vc1, vc2), null, VariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED,
VariantContextUtils.GenotypeMergeType.REQUIRE_UNIQUE, false, false, "set", false, false);
}
// --------------------------------------------------------------------------------
//
// Misc. tests
//
// --------------------------------------------------------------------------------
@Test
public void testAnnotationSet() {
for ( final boolean annotate : Arrays.asList(true, false)) {
for ( final String set : Arrays.asList("set", "combine", "x")) {
final List<String> priority = Arrays.asList("1", "2");
VariantContext vc1 = makeVC("1", Arrays.asList(Aref, T), VariantContext.PASSES_FILTERS);
VariantContext vc2 = makeVC("2", Arrays.asList(Aref, T), VariantContext.PASSES_FILTERS);
final VariantContext merged = VariantContextUtils.simpleMerge(genomeLocParser,
Arrays.asList(vc1, vc2), priority, VariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED,
VariantContextUtils.GenotypeMergeType.PRIORITIZE, annotate, false, set, false, false);
if ( annotate )
Assert.assertEquals(merged.getAttribute(set), VariantContextUtils.MERGE_INTERSECTION);
else
Assert.assertFalse(merged.hasAttribute(set));
}
}
}
private static final List<String> vcs2priority(final Collection<VariantContext> vcs) {
final List<String> priority = new ArrayList<String>();
for ( final VariantContext vc : vcs ) {
priority.add(vc.getSource());
}
return priority;
}
}