Merge branch 'master' of ssh://nickel.broadinstitute.org/humgen/gsa-scr1/gsa-engineering/git/unstable
This commit is contained in:
commit
c255dd5917
|
|
@ -28,7 +28,7 @@ public class TraverseActiveRegions <M,T> extends TraversalEngine<M,T,ActiveRegio
|
||||||
*/
|
*/
|
||||||
protected static Logger logger = Logger.getLogger(TraversalEngine.class);
|
protected static Logger logger = Logger.getLogger(TraversalEngine.class);
|
||||||
|
|
||||||
private final Queue<ActiveRegion> workQueue = new LinkedList<ActiveRegion>();
|
private final LinkedList<ActiveRegion> workQueue = new LinkedList<ActiveRegion>();
|
||||||
private final LinkedHashSet<GATKSAMRecord> myReads = new LinkedHashSet<GATKSAMRecord>();
|
private final LinkedHashSet<GATKSAMRecord> myReads = new LinkedHashSet<GATKSAMRecord>();
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
|
@ -112,7 +112,20 @@ public class TraverseActiveRegions <M,T> extends TraversalEngine<M,T,ActiveRegio
|
||||||
final List<ActiveRegion> activeRegions = bandPassFiltered.createActiveRegions( activeRegionExtension, maxRegionSize );
|
final List<ActiveRegion> activeRegions = bandPassFiltered.createActiveRegions( activeRegionExtension, maxRegionSize );
|
||||||
|
|
||||||
// add active regions to queue of regions to process
|
// add active regions to queue of regions to process
|
||||||
workQueue.addAll( activeRegions );
|
// first check if can merge active regions over shard boundaries
|
||||||
|
if( !activeRegions.isEmpty() ) {
|
||||||
|
if( !workQueue.isEmpty() ) {
|
||||||
|
final ActiveRegion last = workQueue.getLast();
|
||||||
|
final ActiveRegion first = activeRegions.get(0);
|
||||||
|
if( last.isActive == first.isActive && last.getLocation().contiguousP(first.getLocation()) && last.getLocation().size() + first.getLocation().size() <= maxRegionSize ) {
|
||||||
|
workQueue.removeLast();
|
||||||
|
activeRegions.remove(first);
|
||||||
|
workQueue.add( new ActiveRegion(last.getLocation().union(first.getLocation()), first.isActive, this.engine.getGenomeLocParser(), activeRegionExtension) );
|
||||||
|
}
|
||||||
|
}
|
||||||
|
workQueue.addAll( activeRegions );
|
||||||
|
}
|
||||||
|
|
||||||
logger.debug("Integrated " + profile.size() + " isActive calls into " + activeRegions.size() + " regions." );
|
logger.debug("Integrated " + profile.size() + " isActive calls into " + activeRegions.size() + " regions." );
|
||||||
|
|
||||||
// now go and process all of the active regions
|
// now go and process all of the active regions
|
||||||
|
|
|
||||||
|
|
@ -376,7 +376,7 @@ public class HaplotypeScore extends InfoFieldAnnotation implements StandardAnnot
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// indel likelihoods are stric log-probs, not phred scored
|
// indel likelihoods are strict log-probs, not phred scored
|
||||||
double overallScore = 0.0;
|
double overallScore = 0.0;
|
||||||
for (final double[] readHaplotypeScores : haplotypeScores) {
|
for (final double[] readHaplotypeScores : haplotypeScores) {
|
||||||
overallScore += MathUtils.arrayMin(readHaplotypeScores);
|
overallScore += MathUtils.arrayMin(readHaplotypeScores);
|
||||||
|
|
|
||||||
|
|
@ -419,7 +419,7 @@ public class UnifiedGenotyperEngine {
|
||||||
|
|
||||||
// if we are subsetting alleles (either because there were too many or because some were not polymorphic)
|
// if we are subsetting alleles (either because there were too many or because some were not polymorphic)
|
||||||
// then we may need to trim the alleles (because the original VariantContext may have had to pad at the end).
|
// then we may need to trim the alleles (because the original VariantContext may have had to pad at the end).
|
||||||
if ( myAlleles.size() != vc.getAlleles().size() )
|
if ( myAlleles.size() != vc.getAlleles().size() && !limitedContext ) // TODO - this function doesn't work with mixed records or records that started as mixed and then became non-mixed
|
||||||
vcCall = VariantContextUtils.reverseTrimAlleles(vcCall);
|
vcCall = VariantContextUtils.reverseTrimAlleles(vcCall);
|
||||||
|
|
||||||
if ( annotationEngine != null && !limitedContext && rawContext.hasBasePileup() ) {
|
if ( annotationEngine != null && !limitedContext && rawContext.hasBasePileup() ) {
|
||||||
|
|
|
||||||
|
|
@ -37,6 +37,9 @@ import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||||
import org.broadinstitute.sting.gatk.samples.Sample;
|
import org.broadinstitute.sting.gatk.samples.Sample;
|
||||||
import org.broadinstitute.sting.gatk.walkers.RodWalker;
|
import org.broadinstitute.sting.gatk.walkers.RodWalker;
|
||||||
import org.broadinstitute.sting.gatk.walkers.TreeReducible;
|
import org.broadinstitute.sting.gatk.walkers.TreeReducible;
|
||||||
|
import org.broadinstitute.sting.gatk.walkers.genotyper.GenotypeLikelihoodsCalculationModel;
|
||||||
|
import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedArgumentCollection;
|
||||||
|
import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperEngine;
|
||||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||||
import org.broadinstitute.sting.utils.MendelianViolation;
|
import org.broadinstitute.sting.utils.MendelianViolation;
|
||||||
import org.broadinstitute.sting.utils.SampleUtils;
|
import org.broadinstitute.sting.utils.SampleUtils;
|
||||||
|
|
@ -243,6 +246,16 @@ public class SelectVariants extends RodWalker<Integer, Integer> implements TreeR
|
||||||
@Argument(fullName="excludeFiltered", shortName="ef", doc="Don't include filtered loci in the analysis", required=false)
|
@Argument(fullName="excludeFiltered", shortName="ef", doc="Don't include filtered loci in the analysis", required=false)
|
||||||
protected boolean EXCLUDE_FILTERED = false;
|
protected boolean EXCLUDE_FILTERED = false;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* This argument triggers re-genotyping of the selected samples through the Exact calculation model. Note that this is truly the
|
||||||
|
* mathematically correct way to select samples (especially when calls were generated from low coverage sequencing data); using the
|
||||||
|
* hard genotypes to select (i.e. the default mode of SelectVariants) can lead to false positives when errors are confused for variants
|
||||||
|
* in the original genotyping. We decided not to set the --regenotype option as the default though as the output can be unexpected if
|
||||||
|
* a user is strictly comparing against the original genotypes (GTs) in the file.
|
||||||
|
*/
|
||||||
|
@Argument(fullName="regenotype", shortName="regenotype", doc="re-genotype the selected samples based on their GLs (or PLs)", required=false)
|
||||||
|
protected Boolean REGENOTYPE = false;
|
||||||
|
private UnifiedGenotyperEngine UG_engine = null;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* When this argument is used, we can choose to include only multiallelic or biallelic sites, depending on how many alleles are listed in the ALT column of a vcf.
|
* When this argument is used, we can choose to include only multiallelic or biallelic sites, depending on how many alleles are listed in the ALT column of a vcf.
|
||||||
|
|
@ -440,6 +453,13 @@ public class SelectVariants extends RodWalker<Integer, Integer> implements TreeR
|
||||||
SELECT_RANDOM_FRACTION = fractionRandom > 0;
|
SELECT_RANDOM_FRACTION = fractionRandom > 0;
|
||||||
if (SELECT_RANDOM_FRACTION) logger.info("Selecting approximately " + 100.0*fractionRandom + "% of the variants at random from the variant track");
|
if (SELECT_RANDOM_FRACTION) logger.info("Selecting approximately " + 100.0*fractionRandom + "% of the variants at random from the variant track");
|
||||||
|
|
||||||
|
if ( REGENOTYPE ) {
|
||||||
|
final UnifiedArgumentCollection UAC = new UnifiedArgumentCollection();
|
||||||
|
UAC.GLmodel = GenotypeLikelihoodsCalculationModel.Model.BOTH;
|
||||||
|
UAC.OutputMode = UnifiedGenotyperEngine.OUTPUT_MODE.EMIT_ALL_SITES;
|
||||||
|
UAC.NO_SLOD = true;
|
||||||
|
UG_engine = new UnifiedGenotyperEngine(getToolkit(), UAC, logger, null, null, samples, VariantContextUtils.DEFAULT_PLOIDY);
|
||||||
|
}
|
||||||
|
|
||||||
/** load in the IDs file to a hashset for matching */
|
/** load in the IDs file to a hashset for matching */
|
||||||
if ( rsIDFile != null ) {
|
if ( rsIDFile != null ) {
|
||||||
|
|
@ -519,7 +539,14 @@ public class SelectVariants extends RodWalker<Integer, Integer> implements TreeR
|
||||||
continue;
|
continue;
|
||||||
|
|
||||||
VariantContext sub = subsetRecord(vc, samples, EXCLUDE_NON_VARIANTS);
|
VariantContext sub = subsetRecord(vc, samples, EXCLUDE_NON_VARIANTS);
|
||||||
if ( (sub.isPolymorphicInSamples() || !EXCLUDE_NON_VARIANTS) && (!sub.isFiltered() || !EXCLUDE_FILTERED) ) {
|
|
||||||
|
if ( REGENOTYPE && sub.isPolymorphicInSamples() && hasPLs(sub) ) {
|
||||||
|
final VariantContextBuilder builder = new VariantContextBuilder(UG_engine.calculateGenotypes(tracker, ref, context, sub)).filters(sub.getFiltersMaybeNull());
|
||||||
|
addAnnotations(builder, sub);
|
||||||
|
sub = builder.make();
|
||||||
|
}
|
||||||
|
|
||||||
|
if ( (!EXCLUDE_NON_VARIANTS || sub.isPolymorphicInSamples()) && (!EXCLUDE_FILTERED || !sub.isFiltered()) ) {
|
||||||
boolean failedJexlMatch = false;
|
boolean failedJexlMatch = false;
|
||||||
for ( VariantContextUtils.JexlVCMatchExp jexl : jexls ) {
|
for ( VariantContextUtils.JexlVCMatchExp jexl : jexls ) {
|
||||||
if ( !VariantContextUtils.match(sub, jexl) ) {
|
if ( !VariantContextUtils.match(sub, jexl) ) {
|
||||||
|
|
@ -728,6 +755,14 @@ public class SelectVariants extends RodWalker<Integer, Integer> implements TreeR
|
||||||
return recomputedVCs;
|
return recomputedVCs;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private boolean hasPLs(final VariantContext vc) {
|
||||||
|
for ( Genotype g : vc.getGenotypes() ) {
|
||||||
|
if ( g.hasLikelihoods() )
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Checks if vc has a variant call for (at least one of) the samples.
|
* Checks if vc has a variant call for (at least one of) the samples.
|
||||||
* @param vc the variant rod VariantContext. Here, the variant is the dataset you're looking for discordances to (e.g. HapMap)
|
* @param vc the variant rod VariantContext. Here, the variant is the dataset you're looking for discordances to (e.g. HapMap)
|
||||||
|
|
@ -883,9 +918,26 @@ public class SelectVariants extends RodWalker<Integer, Integer> implements TreeR
|
||||||
|
|
||||||
builder.genotypes(newGC);
|
builder.genotypes(newGC);
|
||||||
|
|
||||||
|
addAnnotations(builder, sub);
|
||||||
|
|
||||||
|
return builder.make();
|
||||||
|
}
|
||||||
|
|
||||||
|
private void addAnnotations(final VariantContextBuilder builder, final VariantContext originalVC) {
|
||||||
|
if (KEEP_ORIGINAL_CHR_COUNTS) {
|
||||||
|
if ( originalVC.hasAttribute(VCFConstants.ALLELE_COUNT_KEY) )
|
||||||
|
builder.attribute("AC_Orig", originalVC.getAttribute(VCFConstants.ALLELE_COUNT_KEY));
|
||||||
|
if ( originalVC.hasAttribute(VCFConstants.ALLELE_FREQUENCY_KEY) )
|
||||||
|
builder.attribute("AF_Orig", originalVC.getAttribute(VCFConstants.ALLELE_FREQUENCY_KEY));
|
||||||
|
if ( originalVC.hasAttribute(VCFConstants.ALLELE_NUMBER_KEY) )
|
||||||
|
builder.attribute("AN_Orig", originalVC.getAttribute(VCFConstants.ALLELE_NUMBER_KEY));
|
||||||
|
}
|
||||||
|
|
||||||
|
VariantContextUtils.calculateChromosomeCounts(builder, false);
|
||||||
|
|
||||||
int depth = 0;
|
int depth = 0;
|
||||||
for (String sample : sub.getSampleNames()) {
|
for (String sample : originalVC.getSampleNames()) {
|
||||||
Genotype g = sub.getGenotype(sample);
|
Genotype g = originalVC.getGenotype(sample);
|
||||||
|
|
||||||
if ( g.isNotFiltered() ) {
|
if ( g.isNotFiltered() ) {
|
||||||
|
|
||||||
|
|
@ -895,21 +947,7 @@ public class SelectVariants extends RodWalker<Integer, Integer> implements TreeR
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
if (KEEP_ORIGINAL_CHR_COUNTS) {
|
|
||||||
if ( sub.hasAttribute(VCFConstants.ALLELE_COUNT_KEY) )
|
|
||||||
builder.attribute("AC_Orig", sub.getAttribute(VCFConstants.ALLELE_COUNT_KEY));
|
|
||||||
if ( sub.hasAttribute(VCFConstants.ALLELE_FREQUENCY_KEY) )
|
|
||||||
builder.attribute("AF_Orig", sub.getAttribute(VCFConstants.ALLELE_FREQUENCY_KEY));
|
|
||||||
if ( sub.hasAttribute(VCFConstants.ALLELE_NUMBER_KEY) )
|
|
||||||
builder.attribute("AN_Orig", sub.getAttribute(VCFConstants.ALLELE_NUMBER_KEY));
|
|
||||||
}
|
|
||||||
|
|
||||||
VariantContextUtils.calculateChromosomeCounts(builder, false);
|
|
||||||
builder.attribute("DP", depth);
|
builder.attribute("DP", depth);
|
||||||
|
|
||||||
return builder.make();
|
|
||||||
}
|
}
|
||||||
|
|
||||||
private void randomlyAddVariant(int rank, VariantContext vc) {
|
private void randomlyAddVariant(int rank, VariantContext vc) {
|
||||||
|
|
|
||||||
|
|
@ -158,7 +158,9 @@ public class ActivityProfile {
|
||||||
// find the best place to break up the large active region
|
// find the best place to break up the large active region
|
||||||
Double minProb = Double.MAX_VALUE;
|
Double minProb = Double.MAX_VALUE;
|
||||||
int cutPoint = -1;
|
int cutPoint = -1;
|
||||||
for( int iii = curStart + 50; iii < curEnd - 50; iii++ ) { // BUGBUG: assumes maxRegionSize >> 50
|
|
||||||
|
final int size = curEnd - curStart + 1;
|
||||||
|
for( int iii = curStart + (int)(size*0.25); iii < curEnd - (int)(size*0.25); iii++ ) {
|
||||||
if( isActiveList.get(iii) < minProb ) { minProb = isActiveList.get(iii); cutPoint = iii; }
|
if( isActiveList.get(iii) < minProb ) { minProb = isActiveList.get(iii); cutPoint = iii; }
|
||||||
}
|
}
|
||||||
final List<ActiveRegion> leftList = createActiveRegion(isActive, curStart, cutPoint, activeRegionExtension, maxRegionSize, new ArrayList<ActiveRegion>());
|
final List<ActiveRegion> leftList = createActiveRegion(isActive, curStart, cutPoint, activeRegionExtension, maxRegionSize, new ArrayList<ActiveRegion>());
|
||||||
|
|
|
||||||
|
|
@ -116,6 +116,19 @@ public class SelectVariantsIntegrationTest extends WalkerTest {
|
||||||
executeTest("testUsingDbsnpName--" + testFile, spec);
|
executeTest("testUsingDbsnpName--" + testFile, spec);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testRegenotype() {
|
||||||
|
String testFile = validationDataLocation + "combine.3.vcf";
|
||||||
|
|
||||||
|
WalkerTestSpec spec = new WalkerTestSpec(
|
||||||
|
"-T SelectVariants -R " + b36KGReference + " -regenotype -sn NA12892 --variant " + testFile + " -o %s -NO_HEADER",
|
||||||
|
1,
|
||||||
|
Arrays.asList("0fd8e52bdcd1f4b921d8fb5c689f196a")
|
||||||
|
);
|
||||||
|
|
||||||
|
executeTest("testRegenotype--" + testFile, spec);
|
||||||
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testMultipleRecordsAtOnePosition() {
|
public void testMultipleRecordsAtOnePosition() {
|
||||||
String testFile = validationDataLocation + "selectVariants.onePosition.vcf";
|
String testFile = validationDataLocation + "selectVariants.onePosition.vcf";
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue