1. Initial implementation of BAM writing for the HaplotypeCaller via the -bam argument; currently only the assembled haplotypes themselves are emitted (not the realigned reads).
2. Framework is set up in the VariantAnnotatorEngine so that the HaplotypeCaller can call in to annotate dbSNP plus comparison (comp) RODs. Note that until the HC actually passes in the reference metadata tracker, this will not take effect.
This commit is contained in:
parent
39bc9e999d
commit
94800771e3
|
|
@ -52,6 +52,7 @@ import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
|||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.*;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap;
|
||||
import org.broadinstitute.variant.vcf.*;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
|
|
@ -214,10 +215,10 @@ public class VariantAnnotatorEngine {
|
|||
Map<String, Object> infoAnnotations = new LinkedHashMap<String, Object>(vc.getAttributes());
|
||||
|
||||
// annotate db occurrences
|
||||
vc = annotateDBs(tracker, ref, vc, infoAnnotations);
|
||||
vc = annotateDBs(tracker, ref.getLocus(), vc, infoAnnotations);
|
||||
|
||||
// annotate expressions where available
|
||||
annotateExpressions(tracker, ref, infoAnnotations);
|
||||
annotateExpressions(tracker, ref.getLocus(), infoAnnotations);
|
||||
|
||||
// go through all the requested info annotationTypes
|
||||
for ( InfoFieldAnnotation annotationType : requestedInfoAnnotations ) {
|
||||
|
|
@ -254,10 +255,22 @@ public class VariantAnnotatorEngine {
|
|||
return builder.genotypes(annotateGenotypes(null, null, null, vc, perReadAlleleLikelihoodMap)).make();
|
||||
}
|
||||
|
||||
private VariantContext annotateDBs(RefMetaDataTracker tracker, ReferenceContext ref, VariantContext vc, Map<String, Object> infoAnnotations) {
|
||||
public VariantContext annotateDBs(final RefMetaDataTracker tracker, final GenomeLoc loc, VariantContext vc) {
|
||||
final Map<String, Object> newInfoAnnotations = new HashMap<String, Object>(0);
|
||||
vc = annotateDBs(tracker, loc, vc, newInfoAnnotations);
|
||||
|
||||
if ( !newInfoAnnotations.isEmpty() ) {
|
||||
final VariantContextBuilder builder = new VariantContextBuilder(vc).attributes(newInfoAnnotations);
|
||||
vc = builder.make();
|
||||
}
|
||||
|
||||
return vc;
|
||||
}
|
||||
|
||||
private VariantContext annotateDBs(final RefMetaDataTracker tracker, final GenomeLoc loc, VariantContext vc, final Map<String, Object> infoAnnotations) {
|
||||
for ( Map.Entry<RodBinding<VariantContext>, String> dbSet : dbAnnotations.entrySet() ) {
|
||||
if ( dbSet.getValue().equals(VCFConstants.DBSNP_KEY) ) {
|
||||
final String rsID = VCFUtils.rsIDOfFirstRealVariant(tracker.getValues(dbSet.getKey(), ref.getLocus()), vc.getType());
|
||||
final String rsID = VCFUtils.rsIDOfFirstRealVariant(tracker.getValues(dbSet.getKey(), loc), vc.getType());
|
||||
|
||||
// add the ID if appropriate
|
||||
if ( rsID != null ) {
|
||||
|
|
@ -273,7 +286,7 @@ public class VariantAnnotatorEngine {
|
|||
}
|
||||
} else {
|
||||
boolean overlapsComp = false;
|
||||
for ( VariantContext comp : tracker.getValues(dbSet.getKey(), ref.getLocus()) ) {
|
||||
for ( VariantContext comp : tracker.getValues(dbSet.getKey(), loc) ) {
|
||||
if ( !comp.isFiltered() && ( !requireStrictAlleleMatch || comp.getAlleles().equals(vc.getAlleles()) ) ) {
|
||||
overlapsComp = true;
|
||||
break;
|
||||
|
|
@ -287,9 +300,9 @@ public class VariantAnnotatorEngine {
|
|||
return vc;
|
||||
}
|
||||
|
||||
private void annotateExpressions(RefMetaDataTracker tracker, ReferenceContext ref, Map<String, Object> infoAnnotations) {
|
||||
private void annotateExpressions(final RefMetaDataTracker tracker, final GenomeLoc loc, final Map<String, Object> infoAnnotations) {
|
||||
for ( VAExpression expression : requestedExpressions ) {
|
||||
Collection<VariantContext> VCs = tracker.getValues(expression.binding, ref.getLocus());
|
||||
Collection<VariantContext> VCs = tracker.getValues(expression.binding, loc);
|
||||
if ( VCs.size() == 0 )
|
||||
continue;
|
||||
|
||||
|
|
|
|||
|
|
@ -47,6 +47,8 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
|
||||
|
||||
import com.google.java.contract.Ensures;
|
||||
import com.sun.corba.se.impl.logging.UtilSystemException;
|
||||
import net.sf.samtools.*;
|
||||
import org.broadinstitute.sting.commandline.*;
|
||||
import org.broadinstitute.sting.gatk.CommandLineGATK;
|
||||
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
||||
|
|
@ -57,6 +59,7 @@ import org.broadinstitute.sting.gatk.contexts.AlignmentContextUtils;
|
|||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.downsampling.DownsampleType;
|
||||
import org.broadinstitute.sting.gatk.filters.BadMateFilter;
|
||||
import org.broadinstitute.sting.gatk.io.StingSAMFileWriter;
|
||||
import org.broadinstitute.sting.gatk.iterators.ReadTransformer;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.walkers.*;
|
||||
|
|
@ -67,6 +70,7 @@ import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedArgumentCollection
|
|||
import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperEngine;
|
||||
import org.broadinstitute.sting.gatk.walkers.genotyper.VariantCallContext;
|
||||
import org.broadinstitute.sting.utils.*;
|
||||
import org.broadinstitute.sting.utils.activeregion.ActiveRegion;
|
||||
import org.broadinstitute.sting.utils.activeregion.ActiveRegionReadState;
|
||||
import org.broadinstitute.sting.utils.activeregion.ActivityProfileResult;
|
||||
import org.broadinstitute.sting.utils.clipping.ReadClipper;
|
||||
|
|
@ -142,6 +146,17 @@ public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implem
|
|||
@Output(fullName="graphOutput", shortName="graph", doc="File to which debug assembly graph information should be written", required = false)
|
||||
protected PrintStream graphWriter = null;
|
||||
|
||||
/**
|
||||
* The assembled haplotypes will be written as BAM to this file if requested. Really for debugging purposes only. Note that the output here
|
||||
* does not include uninformative reads so that not every input read is emitted to the bam.
|
||||
*/
|
||||
@Hidden
|
||||
@Output(fullName="bamOutput", shortName="bam", doc="File to which assembled haplotypes should be written", required = false)
|
||||
protected StingSAMFileWriter bamWriter = null;
|
||||
private SAMFileHeader bamHeader = null;
|
||||
private long uniqueNameCounter = 1;
|
||||
private final String readGroupId = "ArtificialHaplotype";
|
||||
|
||||
/**
|
||||
* The PairHMM implementation to use for genotype likelihood calculations. The various implementations balance a tradeoff of accuracy and runtime.
|
||||
*/
|
||||
|
|
@ -242,6 +257,8 @@ public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implem
|
|||
// the genotyping engine
|
||||
private GenotypingEngine genotypingEngine = null;
|
||||
|
||||
private VariantAnnotatorEngine annotationEngine = null;
|
||||
|
||||
// fasta reference reader to supplement the edges of the reference sequence
|
||||
private CachingIndexedFastaSequenceFile referenceReader;
|
||||
|
||||
|
|
@ -286,7 +303,7 @@ public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implem
|
|||
UG_engine_simple_genotyper = new UnifiedGenotyperEngine(getToolkit(), simpleUAC, logger, null, null, samples, GATKVariantContextUtils.DEFAULT_PLOIDY);
|
||||
|
||||
// initialize the output VCF header
|
||||
final VariantAnnotatorEngine annotationEngine = new VariantAnnotatorEngine(Arrays.asList(annotationClassesToUse), annotationsToUse, annotationsToExclude, this, getToolkit());
|
||||
annotationEngine = new VariantAnnotatorEngine(Arrays.asList(annotationClassesToUse), annotationsToUse, annotationsToExclude, this, getToolkit());
|
||||
|
||||
Set<VCFHeaderLine> headerInfo = new HashSet<VCFHeaderLine>();
|
||||
|
||||
|
|
@ -320,6 +337,21 @@ public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implem
|
|||
assemblyEngine = new SimpleDeBruijnAssembler( DEBUG, graphWriter, minKmer );
|
||||
likelihoodCalculationEngine = new LikelihoodCalculationEngine( (byte)gcpHMM, DEBUG, pairHMM );
|
||||
genotypingEngine = new GenotypingEngine( DEBUG, annotationEngine, USE_FILTERED_READ_MAP_FOR_ANNOTATIONS );
|
||||
|
||||
if ( bamWriter != null ) {
|
||||
// prepare the bam header
|
||||
bamHeader = new SAMFileHeader();
|
||||
bamHeader.setSequenceDictionary(getToolkit().getSAMFileHeader().getSequenceDictionary());
|
||||
final List<SAMReadGroupRecord> readGroups = new ArrayList<SAMReadGroupRecord>(1);
|
||||
final SAMReadGroupRecord rg = new SAMReadGroupRecord(readGroupId);
|
||||
rg.setSample("HC");
|
||||
rg.setSequencingCenter("BI");
|
||||
readGroups.add(rg);
|
||||
bamHeader.setReadGroups(readGroups);
|
||||
bamHeader.setSortOrder(SAMFileHeader.SortOrder.coordinate);
|
||||
bamWriter.writeHeader(bamHeader);
|
||||
bamWriter.setPresorted(true);
|
||||
}
|
||||
}
|
||||
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
|
|
@ -408,7 +440,7 @@ public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implem
|
|||
//---------------------------------------------------------------------------------------------------------------
|
||||
|
||||
@Override
|
||||
public Integer map( final org.broadinstitute.sting.utils.activeregion.ActiveRegion activeRegion, final RefMetaDataTracker metaDataTracker ) {
|
||||
public Integer map( final ActiveRegion activeRegion, final RefMetaDataTracker metaDataTracker ) {
|
||||
if ( justDetermineActiveRegions )
|
||||
// we're benchmarking ART and/or the active region determination code in the HC, just leave without doing any work
|
||||
return 1;
|
||||
|
|
@ -461,9 +493,30 @@ public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implem
|
|||
activeRegion.getLocation(),
|
||||
getToolkit().getGenomeLocParser(),
|
||||
activeAllelesToGenotype ) ) {
|
||||
annotationEngine.annotateDBs(metaDataTracker, getToolkit().getGenomeLocParser().createGenomeLoc(call), call);
|
||||
vcfWriter.add( call );
|
||||
}
|
||||
|
||||
if ( bamWriter != null ) {
|
||||
Collections.sort( haplotypes, new Haplotype.HaplotypePositionComparator() );
|
||||
final GenomeLoc paddedRefLoc = getPaddedLoc(activeRegion);
|
||||
for ( Haplotype haplotype : haplotypes ) {
|
||||
// TODO -- clean up this code
|
||||
final GATKSAMRecord record = new GATKSAMRecord(bamHeader);
|
||||
record.setReadBases(haplotype.getBases());
|
||||
record.setAlignmentStart(paddedRefLoc.getStart() + haplotype.getAlignmentStartHapwrtRef());
|
||||
record.setBaseQualities(Utils.dupBytes((byte) '!', haplotype.getBases().length));
|
||||
record.setCigar(haplotype.getCigar());
|
||||
record.setMappingQuality(bestHaplotypes.contains(haplotype) ? 60 : 0);
|
||||
record.setReadName("HC" + uniqueNameCounter++);
|
||||
record.setReadUnmappedFlag(false);
|
||||
record.setReferenceIndex(activeRegion.getReferenceLoc().getContigIndex());
|
||||
record.setAttribute(SAMTag.RG.toString(), readGroupId);
|
||||
record.setFlags(16);
|
||||
bamWriter.addAlignment(record);
|
||||
}
|
||||
}
|
||||
|
||||
if( DEBUG ) { System.out.println("----------------------------------------------------------------------------------"); }
|
||||
|
||||
return 1; // One active region was processed during this map call
|
||||
|
|
|
|||
|
|
@ -75,6 +75,11 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
|
|||
HCTest(NA12878_BAM, "", "a2c63f6e6e51a01019bdbd23125bdb15");
|
||||
}
|
||||
|
||||
@Test(enabled = false)
|
||||
public void testHaplotypeCallerSingleSampleWithDbsnp() {
|
||||
HCTest(NA12878_BAM, "-D " + b37dbSNP132, "");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testHaplotypeCallerMultiSampleGGA() {
|
||||
HCTest(CEUTRIO_BAM, "--max_alternate_alleles 3 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles " + validationDataLocation + "combined.phase1.chr20.raw.indels.sites.vcf",
|
||||
|
|
|
|||
|
|
@ -191,6 +191,10 @@ public class Haplotype {
|
|||
public static class HaplotypeBaseComparator implements Comparator<Haplotype>, Serializable {
|
||||
@Override
|
||||
public int compare( final Haplotype hap1, final Haplotype hap2 ) {
|
||||
return compareHaplotypeBases(hap1, hap2);
|
||||
}
|
||||
|
||||
public static int compareHaplotypeBases(final Haplotype hap1, final Haplotype hap2) {
|
||||
final byte[] arr1 = hap1.getBases();
|
||||
final byte[] arr2 = hap2.getBases();
|
||||
// compares byte arrays using lexical ordering
|
||||
|
|
@ -203,6 +207,14 @@ public class Haplotype {
|
|||
}
|
||||
}
|
||||
|
||||
public static class HaplotypePositionComparator implements Comparator<Haplotype>, Serializable {
|
||||
@Override
|
||||
public int compare( final Haplotype hap1, final Haplotype hap2 ) {
|
||||
final int comp = hap1.getAlignmentStartHapwrtRef() - hap2.getAlignmentStartHapwrtRef();
|
||||
return comp == 0 ? HaplotypeBaseComparator.compareHaplotypeBases(hap1, hap2) : comp;
|
||||
}
|
||||
}
|
||||
|
||||
public static LinkedHashMap<Allele,Haplotype> makeHaplotypeListFromAlleles(final List<Allele> alleleList,
|
||||
final int startPos,
|
||||
final ReferenceContext ref,
|
||||
|
|
|
|||
Loading…
Reference in New Issue