Have Haplotype extend the Allele class.

This way, we don't need to create a new Allele for every read/Haplotype pair to be placed in the PerReadAlleleLikelihoodMap (very inefficient).  Also, now we can easily get the Haplotype associated with the best allele for a given read.
This commit is contained in:
Eric Banks 2013-01-15 11:36:20 -05:00
parent 94800771e3
commit d3baa4b8ca
4 changed files with 93 additions and 65 deletions

View File

@@ -47,7 +47,6 @@
package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
import com.google.java.contract.Ensures;
import com.sun.corba.se.impl.logging.UtilSystemException;
import net.sf.samtools.*;
import org.broadinstitute.sting.commandline.*;
import org.broadinstitute.sting.gatk.CommandLineGATK;
@@ -155,7 +154,7 @@ public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implem
protected StingSAMFileWriter bamWriter = null;
private SAMFileHeader bamHeader = null;
private long uniqueNameCounter = 1;
private final String readGroupId = "ArtificialHaplotype";
private final static String readGroupId = "ArtificialHaplotype";
/**
* The PairHMM implementation to use for genotype likelihood calculations. The various implementations balance a tradeoff of accuracy and runtime.
@@ -338,20 +337,8 @@ public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implem
likelihoodCalculationEngine = new LikelihoodCalculationEngine( (byte)gcpHMM, DEBUG, pairHMM );
genotypingEngine = new GenotypingEngine( DEBUG, annotationEngine, USE_FILTERED_READ_MAP_FOR_ANNOTATIONS );
if ( bamWriter != null ) {
// prepare the bam header
bamHeader = new SAMFileHeader();
bamHeader.setSequenceDictionary(getToolkit().getSAMFileHeader().getSequenceDictionary());
final List<SAMReadGroupRecord> readGroups = new ArrayList<SAMReadGroupRecord>(1);
final SAMReadGroupRecord rg = new SAMReadGroupRecord(readGroupId);
rg.setSample("HC");
rg.setSequencingCenter("BI");
readGroups.add(rg);
bamHeader.setReadGroups(readGroups);
bamHeader.setSortOrder(SAMFileHeader.SortOrder.coordinate);
bamWriter.writeHeader(bamHeader);
bamWriter.setPresorted(true);
}
if ( bamWriter != null )
setupBamWriter();
}
//---------------------------------------------------------------------------------------------------------------
@@ -461,8 +448,7 @@ public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implem
if( UG_engine.getUAC().GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES && activeAllelesToGenotype.isEmpty() ) { return 0; } // No alleles found in this region so nothing to do!
finalizeActiveRegion( activeRegion ); // merge overlapping fragments, clip adapter and low qual tails
final Haplotype referenceHaplotype = new Haplotype(activeRegion.getActiveRegionReference(referenceReader)); // Create the reference haplotype which is the bases from the reference that make up the active region
referenceHaplotype.setIsReference(true);
final Haplotype referenceHaplotype = new Haplotype(activeRegion.getActiveRegionReference(referenceReader), true); // Create the reference haplotype which is the bases from the reference that make up the active region
final byte[] fullReferenceWithPadding = activeRegion.getFullReference(referenceReader, REFERENCE_PADDING);
//int PRUNE_FACTOR = Math.max(MIN_PRUNE_FACTOR, determinePruneFactorFromCoverage( activeRegion ));
final ArrayList<Haplotype> haplotypes = assemblyEngine.runLocalAssembly( activeRegion, referenceHaplotype, fullReferenceWithPadding, getPaddedLoc(activeRegion), MIN_PRUNE_FACTOR, activeAllelesToGenotype );
@@ -498,22 +484,19 @@ public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implem
}
if ( bamWriter != null ) {
// sort the haplotypes in coordinate order and then write them to the bam
Collections.sort( haplotypes, new Haplotype.HaplotypePositionComparator() );
final GenomeLoc paddedRefLoc = getPaddedLoc(activeRegion);
for ( Haplotype haplotype : haplotypes ) {
// TODO -- clean up this code
final GATKSAMRecord record = new GATKSAMRecord(bamHeader);
record.setReadBases(haplotype.getBases());
record.setAlignmentStart(paddedRefLoc.getStart() + haplotype.getAlignmentStartHapwrtRef());
record.setBaseQualities(Utils.dupBytes((byte) '!', haplotype.getBases().length));
record.setCigar(haplotype.getCigar());
record.setMappingQuality(bestHaplotypes.contains(haplotype) ? 60 : 0);
record.setReadName("HC" + uniqueNameCounter++);
record.setReadUnmappedFlag(false);
record.setReferenceIndex(activeRegion.getReferenceLoc().getContigIndex());
record.setAttribute(SAMTag.RG.toString(), readGroupId);
record.setFlags(16);
bamWriter.addAlignment(record);
for ( Haplotype haplotype : haplotypes )
writeHaplotype(haplotype, paddedRefLoc, bestHaplotypes.contains(haplotype));
// now, output the interesting reads for each sample
for ( final PerReadAlleleLikelihoodMap readAlleleLikelihoodMap : stratifiedReadMap.values() ) {
for ( Map.Entry<GATKSAMRecord, Map<Allele, Double>> entry : readAlleleLikelihoodMap.getLikelihoodReadMap().entrySet() ) {
final Allele bestAllele = PerReadAlleleLikelihoodMap.getMostLikelyAllele(entry.getValue());
if ( bestAllele != Allele.NO_CALL )
writeReadAgainstHaplotype(entry.getKey(), (Haplotype)bestAllele);
}
}
}
@@ -608,6 +591,46 @@ public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implem
}
return returnMap;
}
/**
 * Prepares the header for the haplotype debug BAM and hands it to the writer.
 *
 * The header copies the sequence dictionary from the toolkit's input header, carries
 * over the original read groups plus one artificial read group used to tag the
 * synthetic haplotype records, declares coordinate sort order, and marks the writer
 * as presorted.
 */
private void setupBamWriter() {
    // prepare the bam header
    bamHeader = new SAMFileHeader();
    bamHeader.setSequenceDictionary(getToolkit().getSAMFileHeader().getSequenceDictionary());
    bamHeader.setSortOrder(SAMFileHeader.SortOrder.coordinate);

    // build the artificial read group for the haplotype records
    final SAMReadGroupRecord artificialReadGroup = new SAMReadGroupRecord(readGroupId);
    artificialReadGroup.setSample("HC");
    artificialReadGroup.setSequencingCenter("BI");

    // include the original read groups plus the new artificial one
    final List<SAMReadGroupRecord> allReadGroups = new ArrayList<SAMReadGroupRecord>(getToolkit().getSAMFileHeader().getReadGroups());
    allReadGroups.add(artificialReadGroup);
    bamHeader.setReadGroups(allReadGroups);

    bamWriter.writeHeader(bamHeader);
    bamWriter.setPresorted(true);
}
/**
 * Writes the given haplotype to the debug BAM as an artificial aligned record.
 *
 * The record is named "HC" + a unique counter, tagged with the artificial read group,
 * and given a uniform placeholder base quality of 33 ((byte) '!') since haplotypes
 * carry no real per-base qualities.
 *
 * @param haplotype              the haplotype to write
 * @param paddedRefLoc           padded reference location of the active region; supplies the
 *                               contig index and the offset for the record's alignment start
 * @param isAmongBestHaplotypes  true if this haplotype was selected as a best haplotype;
 *                               those get MAPQ 60, all others MAPQ 0
 */
private void writeHaplotype(final Haplotype haplotype, final GenomeLoc paddedRefLoc, final boolean isAmongBestHaplotypes) {
    final GATKSAMRecord record = new GATKSAMRecord(bamHeader);
    record.setReadBases(haplotype.getBases());
    record.setAlignmentStart(paddedRefLoc.getStart() + haplotype.getAlignmentStartHapwrtRef());
    record.setBaseQualities(Utils.dupBytes((byte) '!', haplotype.getBases().length));
    record.setCigar(haplotype.getCigar());
    record.setMappingQuality(isAmongBestHaplotypes ? 60 : 0);
    record.setReadName("HC" + uniqueNameCounter++);
    record.setReadUnmappedFlag(false);
    record.setReferenceIndex(paddedRefLoc.getContigIndex());
    record.setAttribute(SAMTag.RG.toString(), readGroupId);
    // The previous code called setFlags(16), which silently overwrote the flag field set by
    // setReadUnmappedFlag() above. Setting the strand bit explicitly yields the same final
    // flag value (0x10, reverse strand per the SAM spec) without the hidden clobber.
    record.setReadNegativeStrandFlag(true);
    bamWriter.addAlignment(record);
}
// TODO: unimplemented placeholder. Called from the bamWriter section of map() with each
// interesting read and the most likely haplotype for that read; presumably intended to
// emit the read re-aligned against that haplotype into the debug BAM -- confirm intent.
private void writeReadAgainstHaplotype(final GATKSAMRecord read, final Haplotype haplotype) {
}
/*

View File

@@ -138,6 +138,7 @@ public class LikelihoodCalculationEngine {
readQuals[kkk] = ( readQuals[kkk] > (byte) read.getMappingQuality() ? (byte) read.getMappingQuality() : readQuals[kkk] ); // cap base quality by mapping quality
//readQuals[kkk] = ( readQuals[kkk] > readInsQuals[kkk] ? readInsQuals[kkk] : readQuals[kkk] ); // cap base quality by base insertion quality, needs to be evaluated
//readQuals[kkk] = ( readQuals[kkk] > readDelQuals[kkk] ? readDelQuals[kkk] : readQuals[kkk] ); // cap base quality by base deletion quality, needs to be evaluated
// TODO -- why is Q18 hard-coded here???
readQuals[kkk] = ( readQuals[kkk] < (byte) 18 ? QualityUtils.MIN_USABLE_Q_SCORE : readQuals[kkk] );
}
@@ -151,7 +152,7 @@ public class LikelihoodCalculationEngine {
final int haplotypeStart = ( previousHaplotypeSeen == null ? 0 : computeFirstDifferingPosition(haplotype.getBases(), previousHaplotypeSeen.getBases()) );
previousHaplotypeSeen = haplotype;
perReadAlleleLikelihoodMap.add(read, Allele.create(haplotype.getBases()),
perReadAlleleLikelihoodMap.add(read, haplotype,
pairHMM.computeReadLikelihoodGivenHaplotypeLog10(haplotype.getBases(), read.getReadBases(),
readQuals, readInsQuals, readDelQuals, overallGCP, haplotypeStart, jjj == 0));
}

View File

@@ -37,59 +37,71 @@ import org.broadinstitute.variant.variantcontext.VariantContext;
import java.io.Serializable;
import java.util.*;
public class Haplotype {
protected final byte[] bases;
public class Haplotype extends Allele {
protected final double[] quals;
private GenomeLoc genomeLocation = null;
private HashMap<Integer, VariantContext> eventMap = null;
private boolean isRef = false;
private Cigar cigar;
private int alignmentStartHapwrtRef;
public int leftBreakPoint = 0;
public int rightBreakPoint = 0;
private Event artificialEvent = null;
/**
 * Main constructor
 *
 * @param bases bases making up the haplotype; defensively cloned before being passed to Allele
 * @param quals per-base quality scores, one per base; defensively cloned
 * @param isRef is this the reference allele?
 */
public Haplotype( final byte[] bases, final double[] quals, final boolean isRef ) {
    super(bases.clone(), isRef);
    this.quals = quals.clone();
}
/**
* Create a simple consensus sequence with provided bases and a uniform quality over all bases of qual
*
* @param bases bases
* @param qual qual
*/
public Haplotype( final byte[] bases, final int qual ) {
this.bases = bases.clone();
public Haplotype( final byte[] bases, final int qual, final boolean isRef ) {
super(bases.clone(), isRef);
quals = new double[bases.length];
Arrays.fill(quals, (double)qual);
}
// Convenience constructor: uniform quality over all bases, non-reference haplotype.
public Haplotype( final byte[] bases, final int qual ) {
    this(bases, qual, false);
}
// Convenience constructor: explicit reference status, zero quality over all bases.
public Haplotype( final byte[] bases, final boolean isRef ) {
    this(bases, 0, isRef);
}
public Haplotype( final byte[] bases, final double[] quals ) {
this.bases = bases.clone();
this.quals = quals.clone();
this(bases, quals, false);
}
public Haplotype( final byte[] bases ) {
this(bases, 0);
this(bases, 0, false);
}
protected Haplotype( final byte[] bases, final Event artificialEvent ) {
this(bases, 0);
this(bases, 0, false);
this.artificialEvent = artificialEvent;
}
public Haplotype( final byte[] bases, final GenomeLoc loc ) {
this(bases);
this(bases, 0, false);
this.genomeLocation = loc;
}
@Override
public boolean equals( Object h ) {
return h instanceof Haplotype && Arrays.equals(bases, ((Haplotype) h).bases);
return h instanceof Haplotype && super.equals(h);
}
@Override
public int hashCode() {
    // NOTE(review): hashes the bases only, while equals() now delegates to the Allele
    // superclass -- confirm Allele.equals() compares exactly the bases (and not isRef)
    // so the equals/hashCode contract still holds. Also confirm 'bases' is visible to
    // subclasses of Allele, since this class's own 'bases' field was removed.
    return Arrays.hashCode(bases);
}
public HashMap<Integer, VariantContext> getEventMap() {
return eventMap;
}
@@ -98,17 +110,9 @@ public class Haplotype {
this.eventMap = eventMap;
}
public boolean isReference() {
return isRef;
}
public void setIsReference( boolean isRef ) {
this.isRef = isRef;
}
public double getQualitySum() {
double s = 0;
for (int k=0; k < bases.length; k++) {
for (int k=0; k < quals.length; k++) {
s += quals[k];
}
return s;
@@ -116,14 +120,14 @@ public class Haplotype {
@Override
public String toString() {
return new String(bases);
return getDisplayString();
}
public double[] getQuals() {
return quals.clone();
}
public byte[] getBases() {
return bases.clone();
return super.getBases().clone();
}
public long getStartPosition() {
@@ -178,13 +182,13 @@ public class Haplotype {
public Haplotype insertAllele( final Allele refAllele, final Allele altAllele, final int refInsertLocation, final int genomicInsertLocation ) {
// refInsertLocation is in ref haplotype offset coordinates NOT genomic coordinates
final int haplotypeInsertLocation = ReadUtils.getReadCoordinateForReferenceCoordinate(alignmentStartHapwrtRef, cigar, refInsertLocation, ReadUtils.ClippingTail.RIGHT_TAIL, true);
if( haplotypeInsertLocation == -1 || haplotypeInsertLocation + refAllele.length() >= bases.length ) { // desired change falls inside deletion so don't bother creating a new haplotype
if( haplotypeInsertLocation == -1 || haplotypeInsertLocation + refAllele.length() >= getBases().length ) { // desired change falls inside deletion so don't bother creating a new haplotype
return null;
}
byte[] newHaplotypeBases = new byte[]{};
newHaplotypeBases = ArrayUtils.addAll(newHaplotypeBases, ArrayUtils.subarray(bases, 0, haplotypeInsertLocation)); // bases before the variant
newHaplotypeBases = ArrayUtils.addAll(newHaplotypeBases, ArrayUtils.subarray(getBases(), 0, haplotypeInsertLocation)); // bases before the variant
newHaplotypeBases = ArrayUtils.addAll(newHaplotypeBases, altAllele.getBases()); // the alt allele of the variant
newHaplotypeBases = ArrayUtils.addAll(newHaplotypeBases, ArrayUtils.subarray(bases, haplotypeInsertLocation + refAllele.length(), bases.length)); // bases after the variant
newHaplotypeBases = ArrayUtils.addAll(newHaplotypeBases, ArrayUtils.subarray(getBases(), haplotypeInsertLocation + refAllele.length(), getBases().length)); // bases after the variant
return new Haplotype(newHaplotypeBases, new Event(refAllele, altAllele, genomicInsertLocation));
}

View File

@@ -111,7 +111,7 @@ public class Allele implements Comparable<Allele> {
/** A generic static NO_CALL allele for use */
// no public way to create an allele
private Allele(byte[] bases, boolean isRef) {
protected Allele(byte[] bases, boolean isRef) {
// null alleles are no longer allowed
if ( wouldBeNullAllele(bases) ) {
throw new IllegalArgumentException("Null alleles are not supported");
@@ -140,7 +140,7 @@ public class Allele implements Comparable<Allele> {
throw new IllegalArgumentException("Unexpected base in allele bases \'" + new String(bases)+"\'");
}
private Allele(String bases, boolean isRef) {
protected Allele(String bases, boolean isRef) {
this(bases.getBytes(), isRef);
}