Fixing merge conflicts
This commit is contained in:
commit
7bbd2a7a20
|
|
@ -56,8 +56,12 @@ public class GenotypingEngine {
|
|||
|
||||
// This function is the streamlined approach, currently not being used
|
||||
@Requires({"refLoc.containsP(activeRegionWindow)", "haplotypes.size() > 0"})
|
||||
public List<Pair<VariantContext, HashMap<Allele,ArrayList<Haplotype>>>> assignGenotypeLikelihoodsAndCallHaplotypeEvents( final UnifiedGenotyperEngine UG_engine, final ArrayList<Haplotype> haplotypes, final byte[] ref, final GenomeLoc refLoc,
|
||||
final GenomeLoc activeRegionWindow, final GenomeLocParser genomeLocParser ) {
|
||||
public List<Pair<VariantContext, HashMap<Allele,ArrayList<Haplotype>>>> assignGenotypeLikelihoodsAndCallHaplotypeEvents( final UnifiedGenotyperEngine UG_engine,
|
||||
final ArrayList<Haplotype> haplotypes,
|
||||
final byte[] ref,
|
||||
final GenomeLoc refLoc,
|
||||
final GenomeLoc activeRegionWindow,
|
||||
final GenomeLocParser genomeLocParser ) {
|
||||
// Prepare the list of haplotype indices to genotype
|
||||
final ArrayList<Allele> allelesToGenotype = new ArrayList<Allele>();
|
||||
|
||||
|
|
@ -224,7 +228,6 @@ public class GenotypingEngine {
|
|||
}
|
||||
}
|
||||
|
||||
|
||||
// Walk along each position in the key set and create each event to be outputted
|
||||
for( final int loc : startPosKeySet ) {
|
||||
if( loc >= activeRegionWindow.getStart() && loc <= activeRegionWindow.getStop() ) {
|
||||
|
|
@ -533,24 +536,36 @@ public class GenotypingEngine {
|
|||
final int elementLength = ce.getLength();
|
||||
switch( ce.getOperator() ) {
|
||||
case I:
|
||||
{
|
||||
final ArrayList<Allele> insertionAlleles = new ArrayList<Allele>();
|
||||
final int insertionStart = refLoc.getStart() + refPos - 1;
|
||||
insertionAlleles.add( Allele.create(ref[refPos-1], true) );
|
||||
final byte refByte = ref[refPos-1];
|
||||
if( BaseUtils.isRegularBase(refByte) ) {
|
||||
insertionAlleles.add( Allele.create(refByte, true) );
|
||||
}
|
||||
if( haplotype != null && (haplotype.leftBreakPoint + alignmentStartHapwrtRef + refLoc.getStart() - 1 == insertionStart + elementLength + 1 || haplotype.rightBreakPoint + alignmentStartHapwrtRef + refLoc.getStart() - 1 == insertionStart + elementLength + 1) ) {
|
||||
insertionAlleles.add( SYMBOLIC_UNASSEMBLED_EVENT_ALLELE );
|
||||
} else {
|
||||
byte[] insertionBases = new byte[]{};
|
||||
insertionBases = ArrayUtils.add(insertionBases, ref[refPos-1]); // add the padding base
|
||||
insertionBases = ArrayUtils.addAll(insertionBases, Arrays.copyOfRange( alignment, alignmentPos, alignmentPos + elementLength ));
|
||||
insertionAlleles.add( Allele.create(insertionBases, false) );
|
||||
if( BaseUtils.isAllRegularBases(insertionBases) ) {
|
||||
insertionAlleles.add( Allele.create(insertionBases, false) );
|
||||
}
|
||||
}
|
||||
if( insertionAlleles.size() == 2 ) { // found a proper ref and alt allele
|
||||
vcs.put(insertionStart, new VariantContextBuilder(sourceNameToAdd, refLoc.getContig(), insertionStart, insertionStart, insertionAlleles).make());
|
||||
}
|
||||
vcs.put(insertionStart, new VariantContextBuilder(sourceNameToAdd, refLoc.getContig(), insertionStart, insertionStart, insertionAlleles).make());
|
||||
alignmentPos += elementLength;
|
||||
break;
|
||||
}
|
||||
case S:
|
||||
{
|
||||
alignmentPos += elementLength;
|
||||
break;
|
||||
}
|
||||
case D:
|
||||
{
|
||||
final byte[] deletionBases = Arrays.copyOfRange( ref, refPos - 1, refPos + elementLength ); // add padding base
|
||||
final ArrayList<Allele> deletionAlleles = new ArrayList<Allele>();
|
||||
final int deletionStart = refLoc.getStart() + refPos - 1;
|
||||
|
|
@ -561,15 +576,20 @@ public class GenotypingEngine {
|
|||
// deletionAlleles.add( SYMBOLIC_UNASSEMBLED_EVENT_ALLELE );
|
||||
// vcs.put(deletionStart, new VariantContextBuilder(sourceNameToAdd, refLoc.getContig(), deletionStart, deletionStart, deletionAlleles).make());
|
||||
//} else {
|
||||
final byte refByte = ref[refPos-1];
|
||||
if( BaseUtils.isRegularBase(refByte) && BaseUtils.isAllRegularBases(deletionBases) ) {
|
||||
deletionAlleles.add( Allele.create(deletionBases, true) );
|
||||
deletionAlleles.add( Allele.create(ref[refPos-1], false) );
|
||||
deletionAlleles.add( Allele.create(refByte, false) );
|
||||
vcs.put(deletionStart, new VariantContextBuilder(sourceNameToAdd, refLoc.getContig(), deletionStart, deletionStart + elementLength, deletionAlleles).make());
|
||||
}
|
||||
//}
|
||||
refPos += elementLength;
|
||||
break;
|
||||
}
|
||||
case M:
|
||||
case EQ:
|
||||
case X:
|
||||
{
|
||||
int numSinceMismatch = -1;
|
||||
int stopOfMismatch = -1;
|
||||
int startOfMismatch = -1;
|
||||
|
|
@ -592,11 +612,13 @@ public class GenotypingEngine {
|
|||
if( numSinceMismatch > MNP_LOOK_AHEAD || (iii == elementLength - 1 && stopOfMismatch != -1) ) {
|
||||
final byte[] refBases = Arrays.copyOfRange( ref, refPosStartOfMismatch, refPosStartOfMismatch + (stopOfMismatch - startOfMismatch) + 1 );
|
||||
final byte[] mismatchBases = Arrays.copyOfRange( alignment, startOfMismatch, stopOfMismatch + 1 );
|
||||
final ArrayList<Allele> snpAlleles = new ArrayList<Allele>();
|
||||
snpAlleles.add( Allele.create( refBases, true ) );
|
||||
snpAlleles.add( Allele.create( mismatchBases, false ) );
|
||||
final int snpStart = refLoc.getStart() + refPosStartOfMismatch;
|
||||
vcs.put(snpStart, new VariantContextBuilder(sourceNameToAdd, refLoc.getContig(), snpStart, snpStart + (stopOfMismatch - startOfMismatch), snpAlleles).make());
|
||||
if( BaseUtils.isAllRegularBases(refBases) && BaseUtils.isAllRegularBases(mismatchBases) ) {
|
||||
final ArrayList<Allele> snpAlleles = new ArrayList<Allele>();
|
||||
snpAlleles.add( Allele.create( refBases, true ) );
|
||||
snpAlleles.add( Allele.create( mismatchBases, false ) );
|
||||
final int snpStart = refLoc.getStart() + refPosStartOfMismatch;
|
||||
vcs.put(snpStart, new VariantContextBuilder(sourceNameToAdd, refLoc.getContig(), snpStart, snpStart + (stopOfMismatch - startOfMismatch), snpAlleles).make());
|
||||
}
|
||||
numSinceMismatch = -1;
|
||||
stopOfMismatch = -1;
|
||||
startOfMismatch = -1;
|
||||
|
|
@ -606,6 +628,7 @@ public class GenotypingEngine {
|
|||
alignmentPos++;
|
||||
}
|
||||
break;
|
||||
}
|
||||
case N:
|
||||
case H:
|
||||
case P:
|
||||
|
|
|
|||
|
|
@ -27,6 +27,7 @@ package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
|
|||
|
||||
import com.google.java.contract.Ensures;
|
||||
import net.sf.picard.reference.IndexedFastaSequenceFile;
|
||||
import org.broadinstitute.sting.gatk.arguments.StandardCallerArgumentCollection;
|
||||
import org.broadinstitute.sting.gatk.walkers.genotyper.*;
|
||||
import org.broadinstitute.sting.utils.activeregion.ActivityProfileResult;
|
||||
import org.broadinstitute.sting.utils.help.DocumentedGATKFeature;
|
||||
|
|
@ -186,7 +187,7 @@ public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implem
|
|||
protected String[] annotationClassesToUse = { "Standard" };
|
||||
|
||||
@ArgumentCollection
|
||||
private UnifiedArgumentCollection UAC = new UnifiedArgumentCollection();
|
||||
private StandardCallerArgumentCollection SCAC = new StandardCallerArgumentCollection();
|
||||
|
||||
// the calculation arguments
|
||||
private UnifiedGenotyperEngine UG_engine = null;
|
||||
|
|
@ -237,7 +238,7 @@ public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implem
|
|||
Set<String> samples = SampleUtils.getSAMFileSamples(getToolkit().getSAMFileHeader());
|
||||
samplesList.addAll( samples );
|
||||
// initialize the UnifiedGenotyper Engine which is used to call into the exact model
|
||||
UAC.GLmodel = GenotypeLikelihoodsCalculationModel.Model.SNP; // the GLmodel isn't used by the HaplotypeCaller but it is dangerous to let the user change this argument
|
||||
final UnifiedArgumentCollection UAC = new UnifiedArgumentCollection( SCAC ); // this adapter is used so that the full set of unused UG arguments aren't exposed to the HC user
|
||||
UG_engine = new UnifiedGenotyperEngine(getToolkit(), UAC.clone(), logger, null, null, samples, VariantContextUtils.DEFAULT_PLOIDY);
|
||||
UAC.OutputMode = UnifiedGenotyperEngine.OUTPUT_MODE.EMIT_VARIANTS_ONLY; // low values used for isActive determination only, default/user-specified values used for actual calling
|
||||
UAC.GenotypingMode = GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.DISCOVERY; // low values used for isActive determination only, default/user-specified values used for actual calling
|
||||
|
|
@ -410,45 +411,48 @@ public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implem
|
|||
|
||||
for( final Pair<VariantContext, HashMap<Allele, ArrayList<Haplotype>>> callResult :
|
||||
( GENOTYPE_FULL_ACTIVE_REGION && UG_engine.getUAC().GenotypingMode != GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES
|
||||
? genotypingEngine.assignGenotypeLikelihoodsAndCallHaplotypeEvents( UG_engine, bestHaplotypes, fullReferenceWithPadding, getPaddedLoc(activeRegion), activeRegion.getLocation(), getToolkit().getGenomeLocParser() )
|
||||
? genotypingEngine.assignGenotypeLikelihoodsAndCallHaplotypeEvents( UG_engine, bestHaplotypes, fullReferenceWithPadding, getPaddedLoc(activeRegion), activeRegion.getExtendedLoc(), getToolkit().getGenomeLocParser() )
|
||||
: genotypingEngine.assignGenotypeLikelihoodsAndCallIndependentEvents( UG_engine, bestHaplotypes, fullReferenceWithPadding, getPaddedLoc(activeRegion), activeRegion.getLocation(), getToolkit().getGenomeLocParser(), activeAllelesToGenotype ) ) ) {
|
||||
if( DEBUG ) { System.out.println(callResult.getFirst().toStringWithoutGenotypes()); }
|
||||
|
||||
final Map<String, PerReadAlleleLikelihoodMap> stratifiedReadMap = LikelihoodCalculationEngine.partitionReadsBasedOnLikelihoods( getToolkit().getGenomeLocParser(), perSampleReadList, perSampleFilteredReadList, callResult );
|
||||
final VariantContext annotatedCall = annotationEngine.annotateContext(stratifiedReadMap, callResult.getFirst());
|
||||
|
||||
// add some custom annotations to the calls
|
||||
final Map<String, Object> myAttributes = new LinkedHashMap<String, Object>(annotatedCall.getAttributes());
|
||||
// Calculate the number of variants on the haplotype
|
||||
int maxNumVar = 0;
|
||||
for( final Allele allele : callResult.getFirst().getAlleles() ) {
|
||||
if( !allele.isReference() ) {
|
||||
for( final Haplotype haplotype : callResult.getSecond().get(allele) ) {
|
||||
final int numVar = haplotype.getEventMap().size();
|
||||
if( numVar > maxNumVar ) { maxNumVar = numVar; }
|
||||
|
||||
if( !GENOTYPE_FULL_ACTIVE_REGION ) {
|
||||
// add some custom annotations to the calls
|
||||
|
||||
// Calculate the number of variants on the haplotype
|
||||
int maxNumVar = 0;
|
||||
for( final Allele allele : callResult.getFirst().getAlleles() ) {
|
||||
if( !allele.isReference() ) {
|
||||
for( final Haplotype haplotype : callResult.getSecond().get(allele) ) {
|
||||
final int numVar = haplotype.getEventMap().size();
|
||||
if( numVar > maxNumVar ) { maxNumVar = numVar; }
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
// Calculate the event length
|
||||
int maxLength = 0;
|
||||
for ( final Allele a : annotatedCall.getAlternateAlleles() ) {
|
||||
final int length = a.length() - annotatedCall.getReference().length();
|
||||
if( Math.abs(length) > Math.abs(maxLength) ) { maxLength = length; }
|
||||
}
|
||||
// Calculate the event length
|
||||
int maxLength = 0;
|
||||
for ( final Allele a : annotatedCall.getAlternateAlleles() ) {
|
||||
final int length = a.length() - annotatedCall.getReference().length();
|
||||
if( Math.abs(length) > Math.abs(maxLength) ) { maxLength = length; }
|
||||
}
|
||||
|
||||
myAttributes.put("NVH", maxNumVar);
|
||||
myAttributes.put("NumHapEval", bestHaplotypes.size());
|
||||
myAttributes.put("NumHapAssembly", haplotypes.size());
|
||||
myAttributes.put("ActiveRegionSize", activeRegion.getLocation().size());
|
||||
myAttributes.put("EVENTLENGTH", maxLength);
|
||||
myAttributes.put("TYPE", (annotatedCall.isSNP() || annotatedCall.isMNP() ? "SNP" : "INDEL") );
|
||||
myAttributes.put("extType", annotatedCall.getType().toString() );
|
||||
myAttributes.put("NVH", maxNumVar);
|
||||
myAttributes.put("NumHapEval", bestHaplotypes.size());
|
||||
myAttributes.put("NumHapAssembly", haplotypes.size());
|
||||
myAttributes.put("ActiveRegionSize", activeRegion.getLocation().size());
|
||||
myAttributes.put("EVENTLENGTH", maxLength);
|
||||
myAttributes.put("TYPE", (annotatedCall.isSNP() || annotatedCall.isMNP() ? "SNP" : "INDEL") );
|
||||
myAttributes.put("extType", annotatedCall.getType().toString() );
|
||||
|
||||
//if( likelihoodCalculationEngine.haplotypeScore != null ) {
|
||||
// myAttributes.put("HaplotypeScore", String.format("%.4f", likelihoodCalculationEngine.haplotypeScore));
|
||||
//}
|
||||
if( annotatedCall.hasAttribute("QD") ) {
|
||||
myAttributes.put("QDE", String.format("%.2f", Double.parseDouble((String)annotatedCall.getAttribute("QD")) / ((double)maxNumVar)) );
|
||||
//if( likelihoodCalculationEngine.haplotypeScore != null ) {
|
||||
// myAttributes.put("HaplotypeScore", String.format("%.4f", likelihoodCalculationEngine.haplotypeScore));
|
||||
//}
|
||||
if( annotatedCall.hasAttribute("QD") ) {
|
||||
myAttributes.put("QDE", String.format("%.2f", Double.parseDouble((String)annotatedCall.getAttribute("QD")) / ((double)maxNumVar)) );
|
||||
}
|
||||
}
|
||||
|
||||
vcfWriter.add( new VariantContextBuilder(annotatedCall).attributes(myAttributes).make() );
|
||||
|
|
|
|||
|
|
@ -180,7 +180,6 @@ public class LikelihoodCalculationEngine {
|
|||
final double[] readLikelihoods_jjj = jjj_mapped.getReadLikelihoods(sample);
|
||||
for( int kkk = 0; kkk < readLikelihoods_iii.length; kkk++ ) {
|
||||
// Compute log10(10^x1/2 + 10^x2/2) = log10(10^x1+10^x2)-log10(2)
|
||||
// log10(10^(a*x1) + 10^(b*x2)) ???
|
||||
// First term is approximated by Jacobian log with table lookup.
|
||||
haplotypeLikelihood += readCounts_iii[kkk] * ( MathUtils.approximateLog10SumLog10(readLikelihoods_iii[kkk], readLikelihoods_jjj[kkk]) + LOG_ONE_HALF );
|
||||
}
|
||||
|
|
|
|||
|
|
@ -75,10 +75,11 @@ public class BQSRIntegrationTest extends WalkerTest {
|
|||
Arrays.asList(params.md5));
|
||||
executeTest("testBQSR-"+params.args, spec).getFirst();
|
||||
|
||||
WalkerTestSpec specNT2 = new WalkerTestSpec(
|
||||
params.getCommandLine() + " -nt 2",
|
||||
Arrays.asList(params.md5));
|
||||
executeTest("testBQSR-nt2-"+params.args, specNT2).getFirst();
|
||||
// TODO -- re-enable once parallelization is fixed in BaseRecalibrator
|
||||
//WalkerTestSpec specNT2 = new WalkerTestSpec(
|
||||
// params.getCommandLine() + " -nt 2",
|
||||
// Arrays.asList(params.md5));
|
||||
//executeTest("testBQSR-nt2-"+params.args, specNT2).getFirst();
|
||||
}
|
||||
|
||||
@Test
|
||||
|
|
|
|||
|
|
@ -0,0 +1,62 @@
|
|||
package org.broadinstitute.sting.gatk.arguments;
|
||||
|
||||
import org.broadinstitute.sting.commandline.Advanced;
|
||||
import org.broadinstitute.sting.commandline.Argument;
|
||||
import org.broadinstitute.sting.commandline.Input;
|
||||
import org.broadinstitute.sting.commandline.RodBinding;
|
||||
import org.broadinstitute.sting.gatk.walkers.genotyper.GenotypeLikelihoodsCalculationModel;
|
||||
import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperEngine;
|
||||
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
||||
|
||||
/**
|
||||
* Created with IntelliJ IDEA.
|
||||
* User: rpoplin
|
||||
* Date: 8/20/12
|
||||
* A collection of arguments that are common to the various callers.
|
||||
* This is pulled out so that every caller isn't exposed to the arguments from every other caller.
|
||||
*/
|
||||
|
||||
public class StandardCallerArgumentCollection {
|
||||
/**
|
||||
* The expected heterozygosity value used to compute prior likelihoods for any locus. The default priors are:
|
||||
* het = 1e-3, P(hom-ref genotype) = 1 - 3 * het / 2, P(het genotype) = het, P(hom-var genotype) = het / 2
|
||||
*/
|
||||
@Argument(fullName = "heterozygosity", shortName = "hets", doc = "Heterozygosity value used to compute prior likelihoods for any locus", required = false)
|
||||
public Double heterozygosity = UnifiedGenotyperEngine.HUMAN_SNP_HETEROZYGOSITY;
|
||||
|
||||
@Argument(fullName = "genotyping_mode", shortName = "gt_mode", doc = "Specifies how to determine the alternate alleles to use for genotyping", required = false)
|
||||
public GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE GenotypingMode = GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.DISCOVERY;
|
||||
|
||||
@Argument(fullName = "output_mode", shortName = "out_mode", doc = "Specifies which type of calls we should output", required = false)
|
||||
public UnifiedGenotyperEngine.OUTPUT_MODE OutputMode = UnifiedGenotyperEngine.OUTPUT_MODE.EMIT_VARIANTS_ONLY;
|
||||
|
||||
/**
|
||||
* The minimum phred-scaled Qscore threshold to separate high confidence from low confidence calls. Only genotypes with
|
||||
* confidence >= this threshold are emitted as called sites. A reasonable threshold is 30 for high-pass calling (this
|
||||
* is the default).
|
||||
*/
|
||||
@Argument(fullName = "standard_min_confidence_threshold_for_calling", shortName = "stand_call_conf", doc = "The minimum phred-scaled confidence threshold at which variants should be called", required = false)
|
||||
public double STANDARD_CONFIDENCE_FOR_CALLING = 30.0;
|
||||
|
||||
/**
|
||||
* This argument allows you to emit low quality calls as filtered records.
|
||||
*/
|
||||
@Argument(fullName = "standard_min_confidence_threshold_for_emitting", shortName = "stand_emit_conf", doc = "The minimum phred-scaled confidence threshold at which variants should be emitted (and filtered with LowQual if less than the calling threshold)", required = false)
|
||||
public double STANDARD_CONFIDENCE_FOR_EMITTING = 30.0;
|
||||
|
||||
/**
|
||||
* When the UnifiedGenotyper is put into GENOTYPE_GIVEN_ALLELES mode it will genotype the samples using only the alleles provide in this rod binding
|
||||
*/
|
||||
@Input(fullName="alleles", shortName = "alleles", doc="The set of alleles at which to genotype when --genotyping_mode is GENOTYPE_GIVEN_ALLELES", required=false)
|
||||
public RodBinding<VariantContext> alleles;
|
||||
|
||||
/**
|
||||
* If there are more than this number of alternate alleles presented to the genotyper (either through discovery or GENOTYPE_GIVEN ALLELES),
|
||||
* then only this many alleles will be used. Note that genotyping sites with many alternate alleles is both CPU and memory intensive and it
|
||||
* scales exponentially based on the number of alternate alleles. Unless there is a good reason to change the default value, we highly recommend
|
||||
* that you not play around with this parameter.
|
||||
*/
|
||||
@Advanced
|
||||
@Argument(fullName = "max_alternate_alleles", shortName = "maxAltAlleles", doc = "Maximum number of alternate alleles to genotype", required = false)
|
||||
public int MAX_ALTERNATE_ALLELES = 3;
|
||||
}
|
||||
|
|
@ -23,19 +23,10 @@ import java.util.Map;
|
|||
/**
|
||||
* Total (unfiltered) depth over all samples.
|
||||
*
|
||||
* This and AD are complementary fields that are two important ways of thinking about the depth of the data for this sample
|
||||
* at this site. The DP field describe the total depth of reads that passed the Unified Genotypers internal
|
||||
* quality control metrics (like MAPQ > 17, for example), whatever base was present in the read at this site.
|
||||
* The AD values (one for each of REF and ALT fields) is the count of all reads that carried with them the
|
||||
* REF and ALT alleles. The reason for this distinction is that the DP is in some sense reflective of the
|
||||
* power I have to determine the genotype of the sample at this site, while the AD tells me how many times
|
||||
* I saw each of the REF and ALT alleles in the reads, free of any bias potentially introduced by filtering
|
||||
* the reads. If, for example, I believe there really is a an A/T polymorphism at a site, then I would like
|
||||
* to know the counts of A and T bases in this sample, even for reads with poor mapping quality that would
|
||||
* normally be excluded from the statistical calculations going into GQ and QUAL.
|
||||
*
|
||||
* Note that the DP is affected by downsampling (-dcov) though, so the max value one can obtain for N samples with
|
||||
* -dcov D is N * D
|
||||
* While the sample-level (FORMAT) DP field describes the total depth of reads that passed the Unified Genotyper's
|
||||
* internal quality control metrics (like MAPQ > 17, for example), the INFO field DP represents the unfiltered depth
|
||||
* over all samples. Note though that the DP is affected by downsampling (-dcov), so the max value one can obtain for
|
||||
* N samples with -dcov D is N * D
|
||||
*/
|
||||
public class DepthOfCoverage extends InfoFieldAnnotation implements StandardAnnotation, ActiveRegionBasedAnnotation {
|
||||
|
||||
|
|
|
|||
|
|
@ -27,10 +27,10 @@ import java.util.Map;
|
|||
/**
|
||||
* The depth of coverage of each VCF allele in this sample.
|
||||
*
|
||||
* This and DP are complementary fields that are two important ways of thinking about the depth of the data for this sample
|
||||
* at this site. The DP field describe the total depth of reads that passed the Unified Genotypers internal
|
||||
* quality control metrics (like MAPQ > 17, for example), whatever base was present in the read at this site.
|
||||
* The AD values (one for each of REF and ALT fields) is the count of all reads that carried with them the
|
||||
* The AD and DP are complementary fields that are two important ways of thinking about the depth of the data for this
|
||||
* sample at this site. While the sample-level (FORMAT) DP field describes the total depth of reads that passed the
|
||||
* Unified Genotyper's internal quality control metrics (like MAPQ > 17, for example), the AD values (one for each of
|
||||
* REF and ALT fields) is the unfiltered count of all reads that carried with them the
|
||||
* REF and ALT alleles. The reason for this distinction is that the DP is in some sense reflective of the
|
||||
* power I have to determine the genotype of the sample at this site, while the AD tells me how many times
|
||||
* I saw each of the REF and ALT alleles in the reads, free of any bias potentially introduced by filtering
|
||||
|
|
@ -38,11 +38,11 @@ import java.util.Map;
|
|||
* to know the counts of A and T bases in this sample, even for reads with poor mapping quality that would
|
||||
* normally be excluded from the statistical calculations going into GQ and QUAL. Please note, however, that
|
||||
* the AD isn't necessarily calculated exactly for indels. Only reads which are statistically favoring one allele over the other are counted.
|
||||
* Because of this fact, the sum of AD may be much lower than the individual sample depth, especially when there are
|
||||
* Because of this fact, the sum of AD may be different than the individual sample depth, especially when there are
|
||||
* many non-informatice reads.
|
||||
* because the AD includes reads and bases that were filtered by the Unified Genotyper, <b>one should not base
|
||||
* assumptions about the underlying genotype based on it</b>; instead, the genotype likelihoods (PLs) are what
|
||||
* determine the genotype calls (see below).
|
||||
* Because the AD includes reads and bases that were filtered by the Unified Genotyper and in case of indels is based on a statistical computation,
|
||||
* <b>one should not base assumptions about the underlying genotype based on it</b>;
|
||||
* instead, the genotype likelihoods (PLs) are what determine the genotype calls.
|
||||
*/
|
||||
public class DepthPerAlleleBySample extends GenotypeAnnotation implements StandardAnnotation {
|
||||
|
||||
|
|
|
|||
|
|
@ -136,6 +136,10 @@ public class BaseRecalibrator extends LocusWalker<Long, Long> implements TreeRed
|
|||
*/
|
||||
public void initialize() {
|
||||
|
||||
// TODO -- remove me after the 2.1 release
|
||||
if ( getToolkit().getArguments().numberOfThreads > 1 )
|
||||
throw new UserException("We have temporarily disabled the ability to run BaseRecalibrator multi-threaded for performance reasons. We hope to have this fixed for the next GATK release (2.2) and apologize for the inconvenience.");
|
||||
|
||||
// check for unsupported access
|
||||
if (getToolkit().isGATKLite() && !getToolkit().getArguments().disableIndelQuals)
|
||||
throw new UserException.NotSupportedInGATKLite("base insertion/deletion recalibration is not supported, please use the --disable_indel_quals argument");
|
||||
|
|
|
|||
|
|
@ -26,11 +26,12 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.genotyper;
|
||||
|
||||
import org.broadinstitute.sting.commandline.*;
|
||||
import org.broadinstitute.sting.gatk.arguments.StandardCallerArgumentCollection;
|
||||
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
||||
import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils;
|
||||
|
||||
|
||||
public class UnifiedArgumentCollection {
|
||||
public class UnifiedArgumentCollection extends StandardCallerArgumentCollection {
|
||||
|
||||
@Argument(fullName = "genotype_likelihoods_model", shortName = "glm", doc = "Genotype likelihoods calculation model to employ -- SNP is the default option, while INDEL is also available for calling indels and BOTH is available for calling both together", required = false)
|
||||
public GenotypeLikelihoodsCalculationModel.Model GLmodel = GenotypeLikelihoodsCalculationModel.Model.SNP;
|
||||
|
|
@ -42,13 +43,6 @@ public class UnifiedArgumentCollection {
|
|||
@Argument(fullName = "p_nonref_model", shortName = "pnrm", doc = "Non-reference probability calculation model to employ", required = false)
|
||||
protected AlleleFrequencyCalculationModel.Model AFmodel = AlleleFrequencyCalculationModel.Model.EXACT;
|
||||
|
||||
/**
|
||||
* The expected heterozygosity value used to compute prior likelihoods for any locus. The default priors are:
|
||||
* het = 1e-3, P(hom-ref genotype) = 1 - 3 * het / 2, P(het genotype) = het, P(hom-var genotype) = het / 2
|
||||
*/
|
||||
@Argument(fullName = "heterozygosity", shortName = "hets", doc = "Heterozygosity value used to compute prior likelihoods for any locus", required = false)
|
||||
public Double heterozygosity = UnifiedGenotyperEngine.HUMAN_SNP_HETEROZYGOSITY;
|
||||
|
||||
/**
|
||||
* The PCR error rate is independent of the sequencing error rate, which is necessary because we cannot necessarily
|
||||
* distinguish between PCR errors vs. sequencing errors. The practical implication for this value is that it
|
||||
|
|
@ -57,26 +51,6 @@ public class UnifiedArgumentCollection {
|
|||
@Argument(fullName = "pcr_error_rate", shortName = "pcr_error", doc = "The PCR error rate to be used for computing fragment-based likelihoods", required = false)
|
||||
public Double PCR_error = DiploidSNPGenotypeLikelihoods.DEFAULT_PCR_ERROR_RATE;
|
||||
|
||||
@Argument(fullName = "genotyping_mode", shortName = "gt_mode", doc = "Specifies how to determine the alternate alleles to use for genotyping", required = false)
|
||||
public GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE GenotypingMode = GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.DISCOVERY;
|
||||
|
||||
@Argument(fullName = "output_mode", shortName = "out_mode", doc = "Specifies which type of calls we should output", required = false)
|
||||
public UnifiedGenotyperEngine.OUTPUT_MODE OutputMode = UnifiedGenotyperEngine.OUTPUT_MODE.EMIT_VARIANTS_ONLY;
|
||||
|
||||
/**
|
||||
* The minimum phred-scaled Qscore threshold to separate high confidence from low confidence calls. Only genotypes with
|
||||
* confidence >= this threshold are emitted as called sites. A reasonable threshold is 30 for high-pass calling (this
|
||||
* is the default).
|
||||
*/
|
||||
@Argument(fullName = "standard_min_confidence_threshold_for_calling", shortName = "stand_call_conf", doc = "The minimum phred-scaled confidence threshold at which variants should be called", required = false)
|
||||
public double STANDARD_CONFIDENCE_FOR_CALLING = 30.0;
|
||||
|
||||
/**
|
||||
* This argument allows you to emit low quality calls as filtered records.
|
||||
*/
|
||||
@Argument(fullName = "standard_min_confidence_threshold_for_emitting", shortName = "stand_emit_conf", doc = "The minimum phred-scaled confidence threshold at which variants should be emitted (and filtered with LowQual if less than the calling threshold)", required = false)
|
||||
public double STANDARD_CONFIDENCE_FOR_EMITTING = 30.0;
|
||||
|
||||
/**
|
||||
* Note that calculating the SLOD increases the runtime by an appreciable amount.
|
||||
*/
|
||||
|
|
@ -90,12 +64,6 @@ public class UnifiedArgumentCollection {
|
|||
@Argument(fullName = "annotateNDA", shortName = "nda", doc = "If provided, we will annotate records with the number of alternate alleles that were discovered (but not necessarily genotyped) at a given site", required = false)
|
||||
public boolean ANNOTATE_NUMBER_OF_ALLELES_DISCOVERED = false;
|
||||
|
||||
/**
|
||||
* When the UnifiedGenotyper is put into GENOTYPE_GIVEN_ALLELES mode it will genotype the samples using only the alleles provide in this rod binding
|
||||
*/
|
||||
@Input(fullName="alleles", shortName = "alleles", doc="The set of alleles at which to genotype when --genotyping_mode is GENOTYPE_GIVEN_ALLELES", required=false)
|
||||
public RodBinding<VariantContext> alleles;
|
||||
|
||||
/**
|
||||
* The minimum confidence needed in a given base for it to be used in variant calling. Note that the base quality of a base
|
||||
* is capped by the mapping quality so that bases on reads with low mapping quality may get filtered out depending on this value.
|
||||
|
|
@ -107,16 +75,6 @@ public class UnifiedArgumentCollection {
|
|||
@Argument(fullName = "max_deletion_fraction", shortName = "deletions", doc = "Maximum fraction of reads with deletions spanning this locus for it to be callable [to disable, set to < 0 or > 1; default:0.05]", required = false)
|
||||
public Double MAX_DELETION_FRACTION = 0.05;
|
||||
|
||||
/**
|
||||
* If there are more than this number of alternate alleles presented to the genotyper (either through discovery or GENOTYPE_GIVEN ALLELES),
|
||||
* then only this many alleles will be used. Note that genotyping sites with many alternate alleles is both CPU and memory intensive and it
|
||||
* scales exponentially based on the number of alternate alleles. Unless there is a good reason to change the default value, we highly recommend
|
||||
* that you not play around with this parameter.
|
||||
*/
|
||||
@Advanced
|
||||
@Argument(fullName = "max_alternate_alleles", shortName = "maxAltAlleles", doc = "Maximum number of alternate alleles to genotype", required = false)
|
||||
public int MAX_ALTERNATE_ALLELES = 3;
|
||||
|
||||
@Hidden
|
||||
@Argument(fullName = "cap_max_alternate_alleles_for_indels", shortName = "capMaxAltAllelesForIndels", doc = "Cap the maximum number of alternate alleles to genotype for indel calls at 2; overrides the --max_alternate_alleles argument; GSA production use only", required = false)
|
||||
public boolean CAP_MAX_ALTERNATE_ALLELES_FOR_INDELS = false;
|
||||
|
|
@ -139,7 +97,6 @@ public class UnifiedArgumentCollection {
|
|||
@Argument(fullName = "min_indel_fraction_per_sample", shortName = "minIndelFrac", doc = "Minimum fraction of all reads at a locus that must contain an indel (of any allele) for that sample to contribute to the indel count for alleles", required = false)
|
||||
public double MIN_INDEL_FRACTION_PER_SAMPLE = 0.25;
|
||||
|
||||
|
||||
/**
|
||||
* This argument informs the prior probability of having an indel at a site.
|
||||
*/
|
||||
|
|
@ -181,7 +138,6 @@ public class UnifiedArgumentCollection {
|
|||
Generalized ploidy argument (debug only): When building site error models, ignore lane information and build only
|
||||
sample-level error model
|
||||
*/
|
||||
|
||||
@Argument(fullName = "ignoreLaneInfo", shortName = "ignoreLane", doc = "Ignore lane when building error model, error model is then per-site", required = false)
|
||||
public boolean IGNORE_LANE_INFO = false;
|
||||
|
||||
|
|
@ -275,5 +231,16 @@ public class UnifiedArgumentCollection {
|
|||
return uac;
|
||||
}
|
||||
|
||||
public UnifiedArgumentCollection() { }
|
||||
|
||||
public UnifiedArgumentCollection( final StandardCallerArgumentCollection SCAC ) {
|
||||
super();
|
||||
this.alleles = SCAC.alleles;
|
||||
this.GenotypingMode = SCAC.GenotypingMode;
|
||||
this.heterozygosity = SCAC.heterozygosity;
|
||||
this.MAX_ALTERNATE_ALLELES = SCAC.MAX_ALTERNATE_ALLELES;
|
||||
this.OutputMode = SCAC.OutputMode;
|
||||
this.STANDARD_CONFIDENCE_FOR_CALLING = SCAC.STANDARD_CONFIDENCE_FOR_CALLING;
|
||||
this.STANDARD_CONFIDENCE_FOR_EMITTING = SCAC.STANDARD_CONFIDENCE_FOR_EMITTING;
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -227,14 +227,21 @@ public class BaseUtils {
|
|||
}
|
||||
|
||||
@Deprecated
|
||||
static public boolean isRegularBase(char base) {
|
||||
static public boolean isRegularBase( final char base ) {
|
||||
return simpleBaseToBaseIndex(base) != -1;
|
||||
}
|
||||
|
||||
static public boolean isRegularBase(byte base) {
|
||||
static public boolean isRegularBase( final byte base ) {
|
||||
return simpleBaseToBaseIndex(base) != -1;
|
||||
}
|
||||
|
||||
static public boolean isAllRegularBases( final byte[] bases ) {
|
||||
for( final byte base : bases) {
|
||||
if( !isRegularBase(base) ) { return false; }
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
static public boolean isNBase(byte base) {
|
||||
return base == 'N' || base == 'n';
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in New Issue