Merge branch 'master' of ssh://gsa2.broadinstitute.org/humgen/gsa-scr1/gsa-engineering/git/unstable

This commit is contained in:
Ryan Poplin 2012-06-18 08:51:48 -04:00
commit 5ec737f008
181 changed files with 108461 additions and 4276 deletions

View File

@ -2,6 +2,7 @@ library(gsalib)
library(ggplot2) library(ggplot2)
library(gplots) library(gplots)
library(tools) library(tools)
library(reshape)
# #
# Standard command line switch. Can be loaded interactively for development # Standard command line switch. Can be loaded interactively for development

View File

@ -59,6 +59,7 @@ import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.interval.IntervalSetRule; import org.broadinstitute.sting.utils.interval.IntervalSetRule;
import org.broadinstitute.sting.utils.interval.IntervalUtils; import org.broadinstitute.sting.utils.interval.IntervalUtils;
import org.broadinstitute.sting.utils.recalibration.BaseRecalibration; import org.broadinstitute.sting.utils.recalibration.BaseRecalibration;
import org.broadinstitute.sting.utils.variantcontext.GenotypeBuilder;
import java.io.File; import java.io.File;
import java.io.FileInputStream; import java.io.FileInputStream;
@ -221,6 +222,10 @@ public class GenomeAnalysisEngine {
if (this.getArguments().nonDeterministicRandomSeed) if (this.getArguments().nonDeterministicRandomSeed)
resetRandomGenerator(System.currentTimeMillis()); resetRandomGenerator(System.currentTimeMillis());
// TODO -- REMOVE ME WHEN WE STOP BCF testing
if ( this.getArguments().USE_SLOW_GENOTYPES )
GenotypeBuilder.MAKE_FAST_BY_DEFAULT = false;
// if the user specified an input BQSR recalibration table then enable on the fly recalibration // if the user specified an input BQSR recalibration table then enable on the fly recalibration
if (this.getArguments().BQSR_RECAL_FILE != null) if (this.getArguments().BQSR_RECAL_FILE != null)
setBaseRecalibration(this.getArguments().BQSR_RECAL_FILE, this.getArguments().quantizationLevels); setBaseRecalibration(this.getArguments().BQSR_RECAL_FILE, this.getArguments().quantizationLevels);

View File

@ -51,11 +51,6 @@ public class ReadProperties {
return includeReadsWithDeletionAtLoci; return includeReadsWithDeletionAtLoci;
} }
@Deprecated
public boolean generateExtendedEvents() {
return false;
}
/** /**
* Gets a list of the files acting as sources of reads. * Gets a list of the files acting as sources of reads.
* @return A list of files storing reads data. * @return A list of files storing reads data.

View File

@ -336,6 +336,11 @@ public class GATKArgumentCollection {
public boolean generateShadowBCF = false; public boolean generateShadowBCF = false;
// TODO -- remove all code tagged with TODO -- remove me when argument generateShadowBCF is removed // TODO -- remove all code tagged with TODO -- remove me when argument generateShadowBCF is removed
@Argument(fullName="useSlowGenotypes",shortName = "useSlowGenotypes",doc="",required=false)
@Hidden
public boolean USE_SLOW_GENOTYPES = false;
// TODO -- remove all code tagged with TODO -- remove me when argument generateShadowBCF is removed
/** /**
* The file pointed to by this argument must be a VCF file. The GATK will read in just the header of this file * The file pointed to by this argument must be a VCF file. The GATK will read in just the header of this file
* and then use the INFO, FORMAT, and FILTER field values from this file to repair the header file of any other * and then use the INFO, FORMAT, and FILTER field values from this file to repair the header file of any other

View File

@ -28,7 +28,6 @@ package org.broadinstitute.sting.gatk.contexts;
import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.HasGenomeLocation; import org.broadinstitute.sting.utils.HasGenomeLocation;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.pileup.ReadBackedExtendedEventPileup;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
@ -89,36 +88,9 @@ public class AlignmentContext implements HasGenomeLocation {
* @return * @return
*/ */
public ReadBackedPileup getBasePileup() { public ReadBackedPileup getBasePileup() {
if(!hasBasePileup())
throw new ReviewedStingException("No base pileup is available. Please check for a base pileup with hasBasePileup() before attempting to retrieve a pileup.");
return basePileup; return basePileup;
} }
/** Returns extended event (indel) pileup over the current genomic location. May return null if this context keeps
* only base pileup.
* @return
*/
@Deprecated
public ReadBackedExtendedEventPileup getExtendedEventPileup() {
if(!hasExtendedEventPileup())
throw new ReviewedStingException("No extended event pileup is present.");
return (ReadBackedExtendedEventPileup)basePileup;
}
/**
* Returns true if this alignment context keeps base pileup over the current genomic location.
* TODO: Syntax of AlignmentContext uses hasBasePileup() / hasExtendedEventPileup() as an enumeration mechanism. Change this to a more sensible interface.
* @return
*/
public boolean hasBasePileup() { return !(basePileup instanceof ReadBackedExtendedEventPileup); }
/** Returns true if this alignment context keeps extended event (indel) pileup over the current genomic location.
*
* @return
*/
@Deprecated
public boolean hasExtendedEventPileup() { return basePileup instanceof ReadBackedExtendedEventPileup; }
/** /**
* Returns true if any reads have been filtered out of the pileup due to excess DoC. * Returns true if any reads have been filtered out of the pileup due to excess DoC.
* @return True if reads have been filtered out. False otherwise. * @return True if reads have been filtered out. False otherwise.

View File

@ -116,19 +116,15 @@ public class AlignmentContextUtils {
* *
**/ **/
public static Map<SAMReadGroupRecord, AlignmentContext> splitContextByReadGroup(AlignmentContext context, Collection<SAMReadGroupRecord> readGroups) { public static Map<SAMReadGroupRecord, AlignmentContext> splitContextByReadGroup(AlignmentContext context, Collection<SAMReadGroupRecord> readGroups) {
if ( ! context.hasBasePileup() ) { HashMap<SAMReadGroupRecord, AlignmentContext> contexts = new HashMap<SAMReadGroupRecord, AlignmentContext>();
return Collections.emptyMap();
} else {
HashMap<SAMReadGroupRecord, AlignmentContext> contexts = new HashMap<SAMReadGroupRecord, AlignmentContext>();
for (SAMReadGroupRecord rg : readGroups) { for (SAMReadGroupRecord rg : readGroups) {
ReadBackedPileup rgPileup = context.getBasePileup().getPileupForReadGroup(rg.getReadGroupId()); ReadBackedPileup rgPileup = context.getBasePileup().getPileupForReadGroup(rg.getReadGroupId());
if ( rgPileup != null ) // there were some reads for RG if ( rgPileup != null ) // there were some reads for RG
contexts.put(rg, new AlignmentContext(context.getLocation(), rgPileup)); contexts.put(rg, new AlignmentContext(context.getLocation(), rgPileup));
}
return contexts;
} }
return contexts;
} }
public static Map<String, AlignmentContext> splitContextBySampleName(ReadBackedPileup pileup) { public static Map<String, AlignmentContext> splitContextBySampleName(ReadBackedPileup pileup) {
@ -139,32 +135,16 @@ public class AlignmentContextUtils {
public static AlignmentContext joinContexts(Collection<AlignmentContext> contexts) { public static AlignmentContext joinContexts(Collection<AlignmentContext> contexts) {
// validation // validation
GenomeLoc loc = contexts.iterator().next().getLocation(); GenomeLoc loc = contexts.iterator().next().getLocation();
boolean isExtended = contexts.iterator().next().basePileup instanceof ReadBackedExtendedEventPileup;
for(AlignmentContext context: contexts) { for(AlignmentContext context: contexts) {
if(!loc.equals(context.getLocation())) if(!loc.equals(context.getLocation()))
throw new ReviewedStingException("Illegal attempt to join contexts from different genomic locations"); throw new ReviewedStingException("Illegal attempt to join contexts from different genomic locations");
if(isExtended != (context.basePileup instanceof ReadBackedExtendedEventPileup))
throw new ReviewedStingException("Illegal attempt to join simple and extended contexts");
} }
AlignmentContext jointContext; List<PileupElement> pe = new ArrayList<PileupElement>();
if(isExtended) { for(AlignmentContext context: contexts) {
List<ExtendedEventPileupElement> pe = new ArrayList<ExtendedEventPileupElement>(); for(PileupElement pileupElement: context.basePileup)
for(AlignmentContext context: contexts) { pe.add(pileupElement);
for(PileupElement pileupElement: context.basePileup)
pe.add((ExtendedEventPileupElement)pileupElement);
}
jointContext = new AlignmentContext(loc, new ReadBackedExtendedEventPileupImpl(loc,pe));
} }
else { return new AlignmentContext(loc, new ReadBackedPileupImpl(loc,pe));
List<PileupElement> pe = new ArrayList<PileupElement>();
for(AlignmentContext context: contexts) {
for(PileupElement pileupElement: context.basePileup)
pe.add(pileupElement);
}
jointContext = new AlignmentContext(loc, new ReadBackedPileupImpl(loc,pe));
}
return jointContext;
} }
} }

View File

@ -0,0 +1,76 @@
/*
* Copyright (c) 2012, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.downsampling;
import java.util.Collection;
import java.util.List;
/**
* The basic downsampler API, with no reads-specific operations
*
* @author David Roazen
*/
public interface Downsampler<T> {
/*
* Submit one item to the downsampler for consideration . Some downsamplers will be able to determine
* immediately whether the item survives the downsampling process, while others will need to see
* more items before making that determination.
*/
public void submit( T item );
/*
* Submit a collection of items to the downsampler for consideration.
*/
public void submit( Collection<T> items );
/*
* Are there items that have survived the downsampling process waiting to be retrieved?
*/
public boolean hasDownsampledItems();
/*
* Return (and remove) all items that have survived downsampling and are waiting to be retrieved.
*/
public List<T> consumeDownsampledItems();
/*
* Are there items stored in this downsampler that it doesn't yet know whether they will
* ultimately survive the downsampling process?
*/
public boolean hasPendingItems();
/*
* Used to tell the downsampler that no more items will be submitted to it, and that it should
* finalize any pending items.
*/
public void signalEndOfInput();
/*
* Reset the downsampler to a clean state, devoid of any pending/downsampled items or tracked state
* information.
*/
public void clear();
}

View File

@ -0,0 +1,98 @@
/*
* Copyright (c) 2012, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.downsampling;
import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.gatk.iterators.StingSAMIterator;
import java.util.Collection;
import java.util.Iterator;
import java.util.NoSuchElementException;
/**
* StingSAMIterator wrapper around our generic reads downsampler interface
*
* @author David Roazen
*/
public class DownsamplingReadsIterator implements StingSAMIterator {
private StingSAMIterator nestedSAMIterator;
private ReadsDownsampler<SAMRecord> downsampler;
private Collection<SAMRecord> downsampledReadsCache;
private Iterator<SAMRecord> downsampledReadsCacheIterator;
public DownsamplingReadsIterator( StingSAMIterator iter, ReadsDownsampler<SAMRecord> downsampler ) {
nestedSAMIterator = iter;
this.downsampler = downsampler;
fillDownsampledReadsCache();
}
public boolean hasNext() {
if ( downsampledReadsCacheIterator.hasNext() ) {
return true;
}
else if ( ! nestedSAMIterator.hasNext() || ! fillDownsampledReadsCache() ) {
return false;
}
return true;
}
public SAMRecord next() {
if ( ! downsampledReadsCacheIterator.hasNext() && ! fillDownsampledReadsCache() ) {
throw new NoSuchElementException("next() called when there are no more items");
}
return downsampledReadsCacheIterator.next();
}
private boolean fillDownsampledReadsCache() {
while ( nestedSAMIterator.hasNext() && ! downsampler.hasDownsampledItems() ) {
downsampler.submit(nestedSAMIterator.next());
}
if ( ! nestedSAMIterator.hasNext() ) {
downsampler.signalEndOfInput();
}
downsampledReadsCache = downsampler.consumeDownsampledItems();
downsampledReadsCacheIterator = downsampledReadsCache.iterator();
return downsampledReadsCacheIterator.hasNext();
}
public void remove() {
throw new UnsupportedOperationException("Can not remove records from a SAM file via an iterator!");
}
public void close() {
nestedSAMIterator.close();
}
public Iterator<SAMRecord> iterator() {
return this;
}
}

View File

@ -0,0 +1,94 @@
/*
* Copyright (c) 2012, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.downsampling;
import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
/**
* Fractional Downsampler: selects a specified fraction of the reads for inclusion
*
* @author David Roazen
*/
public class FractionalDownsampler<T extends SAMRecord> implements ReadsDownsampler<T> {
private ArrayList<T> selectedReads;
private int cutoffForInclusion;
private static final int RANDOM_POOL_SIZE = 10000;
public FractionalDownsampler( double fraction ) {
if ( fraction < 0.0 || fraction > 1.0 ) {
throw new ReviewedStingException("Fraction of reads to include must be between 0.0 and 1.0, inclusive");
}
cutoffForInclusion = (int)(fraction * RANDOM_POOL_SIZE);
clear();
}
public void submit( T newRead ) {
if ( GenomeAnalysisEngine.getRandomGenerator().nextInt(10000) < cutoffForInclusion ) {
selectedReads.add(newRead);
}
}
public void submit( Collection<T> newReads ) {
for ( T read : newReads ) {
submit(read);
}
}
public boolean hasDownsampledItems() {
return selectedReads.size() > 0;
}
public List<T> consumeDownsampledItems() {
List<T> downsampledItems = selectedReads;
clear();
return downsampledItems;
}
public boolean hasPendingItems() {
return false;
}
public void signalEndOfInput() {
// NO-OP
}
public void clear() {
selectedReads = new ArrayList<T>();
}
public boolean requiresCoordinateSortOrder() {
return false;
}
}

View File

@ -0,0 +1,259 @@
/*
* Copyright (c) 2012, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.downsampling;
import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import java.util.*;
/**
* Positional Downsampler: When eliminating reads, try to do so evenly based on the alignment start positions
*
* @author David Roazen
*/
public class PositionalDownsampler<T extends SAMRecord> implements ReadsDownsampler<T> {
private int targetCoverage;
private ReservoirDownsampler<T> reservoir;
private int currentContigIndex;
private int currentAlignmentStart;
private LinkedList<PositionalReadGrouping> pendingReads;
private ArrayList<T> finalizedReads;
public PositionalDownsampler ( int targetCoverage ) {
this.targetCoverage = targetCoverage;
clear();
}
public void submit ( T newRead ) {
if ( readIsPastCurrentPosition(newRead) ) {
updateAndDownsamplePendingReads();
}
reservoir.submit(newRead);
updateCurrentPosition(newRead);
}
public void submit ( Collection<T> newReads ) {
for ( T read : newReads ) {
submit(read);
}
}
public boolean hasDownsampledItems() {
return finalizedReads.size() > 0;
}
public List<T> consumeDownsampledItems() {
List<T> toReturn = finalizedReads;
finalizedReads = new ArrayList<T>();
return toReturn;
}
public boolean hasPendingItems() {
return pendingReads.size() > 0;
}
public void signalEndOfInput() {
updateAndDownsamplePendingReads();
for ( PositionalReadGrouping group : pendingReads ) {
group.finalizeAllActiveReads();
finalizedReads.addAll(group.getFinalizedReads());
}
pendingReads.clear();
}
public void clear() {
reservoir = new ReservoirDownsampler<T>(targetCoverage);
pendingReads = new LinkedList<PositionalReadGrouping>();
finalizedReads = new ArrayList<T>();
}
public boolean requiresCoordinateSortOrder() {
return true;
}
private void updateCurrentPosition ( T read ) {
currentContigIndex = read.getReferenceIndex();
currentAlignmentStart = read.getAlignmentStart();
}
private boolean readIsPastCurrentPosition ( T read ) {
return read.getReferenceIndex() != currentContigIndex || read.getAlignmentStart() > currentAlignmentStart;
}
private void updateAndDownsamplePendingReads() {
finalizeOutOfScopeReads();
List<T> oldLocusReads = reservoir.consumeDownsampledItems();
pendingReads.add(new PositionalReadGrouping(oldLocusReads, currentContigIndex, currentAlignmentStart));
downsampleOverlappingGroups();
}
private void finalizeOutOfScopeReads() {
Iterator<PositionalReadGrouping> iter = pendingReads.iterator();
boolean noPrecedingUnfinalizedGroups = true;
while ( iter.hasNext() ) {
PositionalReadGrouping currentGroup = iter.next();
currentGroup.finalizeActiveReadsBeforePosition(currentContigIndex, currentAlignmentStart);
if ( currentGroup.isFinalized() && noPrecedingUnfinalizedGroups ) {
iter.remove();
finalizedReads.addAll(currentGroup.getFinalizedReads());
}
else {
noPrecedingUnfinalizedGroups = false;
}
}
}
private void downsampleOverlappingGroups() {
int[] groupReadCounts = new int[pendingReads.size()];
int totalCoverage = 0;
int numActiveGroups = 0;
int currentGroup = 0;
for ( PositionalReadGrouping group : pendingReads ) {
groupReadCounts[currentGroup] = group.numActiveReads();
totalCoverage += groupReadCounts[currentGroup];
if ( groupReadCounts[currentGroup] > 0 ) {
numActiveGroups++;
}
currentGroup++;
}
if ( totalCoverage <= targetCoverage ) {
return;
}
int numReadsToRemove = Math.min(totalCoverage - targetCoverage, totalCoverage - numActiveGroups);
currentGroup = 0;
while ( numReadsToRemove > 0 ) {
if ( groupReadCounts[currentGroup] > 1 ) {
groupReadCounts[currentGroup]--;
numReadsToRemove--;
}
currentGroup = (currentGroup + 1) % groupReadCounts.length;
}
currentGroup = 0;
for ( PositionalReadGrouping group : pendingReads ) {
if ( ! group.isFinalized() ) {
group.downsampleActiveReads(groupReadCounts[currentGroup]);
}
currentGroup++;
}
}
private class PositionalReadGrouping {
private List<T> activeReads;
private List<T> finalizedReads;
private int contig;
private int alignmentStart;
public PositionalReadGrouping( Collection<T> reads, int contig, int alignmentStart ) {
activeReads = new LinkedList<T>(reads);
finalizedReads = new ArrayList<T>();
this.contig = contig;
this.alignmentStart = alignmentStart;
}
public int numActiveReads() {
return activeReads.size();
}
public boolean isFinalized() {
return activeReads.size() == 0;
}
public List<T> getFinalizedReads() {
return finalizedReads;
}
public void finalizeActiveReadsBeforePosition( int contig, int position ) {
if ( this.contig != contig ) {
finalizeAllActiveReads();
return;
}
Iterator<T> iter = activeReads.iterator();
while ( iter.hasNext() ) {
T read = iter.next();
if ( read.getAlignmentEnd() < position ) {
iter.remove();
finalizedReads.add(read);
}
}
}
public void finalizeAllActiveReads() {
finalizedReads.addAll(activeReads);
activeReads.clear();
}
public void downsampleActiveReads( int numReadsToKeep ) {
if ( numReadsToKeep > activeReads.size() || numReadsToKeep < 0 ) {
throw new ReviewedStingException(String.format("Cannot retain %d reads out of %d total reads",
numReadsToKeep, activeReads.size()));
}
BitSet itemsToKeep = new BitSet(activeReads.size());
for ( Integer selectedIndex : MathUtils.sampleIndicesWithoutReplacement(activeReads.size(), numReadsToKeep) ) {
itemsToKeep.set(selectedIndex);
}
int currentIndex = 0;
Iterator<T> iter = activeReads.iterator();
while ( iter.hasNext() ) {
T read = iter.next();
if ( ! itemsToKeep.get(currentIndex) ) {
iter.remove();
}
currentIndex++;
}
}
}
}

View File

@ -0,0 +1,40 @@
/*
* Copyright (c) 2012, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.downsampling;
import net.sf.samtools.SAMRecord;
/**
* An extension of the basic downsampler API with reads-specific operations
*
* @author David Roazen
*/
public interface ReadsDownsampler<T extends SAMRecord> extends Downsampler<T> {
/*
* Does this downsampler require that reads be fed to it in coordinate order?
*/
public boolean requiresCoordinateSortOrder();
}

View File

@ -0,0 +1,106 @@
/*
* Copyright (c) 2012, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.downsampling;
import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
/**
* Reservoir Downsampler: Selects n reads out of a stream whose size is not known in advance, with
* every read in the stream having an equal chance of being selected for inclusion.
*
* An implementation of "Algorithm R" from the paper "Random Sampling with a Reservoir" (Jeffrey Scott Vitter, 1985)
*
* @author David Roazen
*/
public class ReservoirDownsampler<T extends SAMRecord> implements ReadsDownsampler<T> {
private ArrayList<T> reservoir;
private int targetSampleSize;
private int totalReadsSeen;
public ReservoirDownsampler ( int targetSampleSize ) {
if ( targetSampleSize <= 0 ) {
throw new ReviewedStingException("Cannot do reservoir downsampling with a sample size <= 0");
}
this.targetSampleSize = targetSampleSize;
clear();
}
public void submit ( T newRead ) {
totalReadsSeen++;
if ( totalReadsSeen <= targetSampleSize ) {
reservoir.add(newRead);
}
else {
int randomSlot = GenomeAnalysisEngine.getRandomGenerator().nextInt(totalReadsSeen);
if ( randomSlot < targetSampleSize ) {
reservoir.set(randomSlot, newRead);
}
}
}
public void submit ( Collection<T> newReads ) {
for ( T read : newReads ) {
submit(read);
}
}
public boolean hasDownsampledItems() {
return reservoir.size() > 0;
}
public List<T> consumeDownsampledItems() {
List<T> downsampledItems = reservoir;
clear();
return downsampledItems;
}
public boolean hasPendingItems() {
return false;
}
public void signalEndOfInput() {
// NO-OP
}
public void clear() {
reservoir = new ArrayList<T>(targetSampleSize);
totalReadsSeen = 0;
}
public boolean requiresCoordinateSortOrder() {
return false;
}
}

View File

@ -38,10 +38,7 @@ import org.broadinstitute.sting.utils.variantcontext.writer.Options;
import org.broadinstitute.sting.utils.variantcontext.writer.VariantContextWriter; import org.broadinstitute.sting.utils.variantcontext.writer.VariantContextWriter;
import org.broadinstitute.sting.utils.variantcontext.writer.VariantContextWriterFactory; import org.broadinstitute.sting.utils.variantcontext.writer.VariantContextWriterFactory;
import java.io.File; import java.io.*;
import java.io.IOException;
import java.io.OutputStream;
import java.io.PrintStream;
import java.util.Arrays; import java.util.Arrays;
import java.util.EnumSet; import java.util.EnumSet;
import java.util.List; import java.util.List;
@ -58,6 +55,8 @@ public class VariantContextWriterStorage implements Storage<VariantContextWriter
*/ */
private static Logger logger = Logger.getLogger(VariantContextWriterStorage.class); private static Logger logger = Logger.getLogger(VariantContextWriterStorage.class);
private final static int BUFFER_SIZE = 1048576;
protected final File file; protected final File file;
protected OutputStream stream; protected OutputStream stream;
protected final VariantContextWriter writer; protected final VariantContextWriter writer;
@ -93,7 +92,7 @@ public class VariantContextWriterStorage implements Storage<VariantContextWriter
if ( stub.isCompressed() ) if ( stub.isCompressed() )
stream = new BlockCompressedOutputStream(file); stream = new BlockCompressedOutputStream(file);
else else
stream = new PrintStream(file); stream = new PrintStream(new BufferedOutputStream(new FileOutputStream(file), BUFFER_SIZE));
} }
catch(IOException ex) { catch(IOException ex) {
throw new UserException.CouldNotCreateOutputFile(file, "Unable to open target output stream", ex); throw new UserException.CouldNotCreateOutputFile(file, "Unable to open target output stream", ex);

View File

@ -51,6 +51,8 @@ import java.util.List;
* @version 0.1 * @version 0.1
*/ */
public class VariantContextWriterStub implements Stub<VariantContextWriter>, VariantContextWriter { public class VariantContextWriterStub implements Stub<VariantContextWriter>, VariantContextWriter {
public final static boolean UPDATE_CONTIG_HEADERS = true;
/** /**
* The engine, central to the GATK's processing. * The engine, central to the GATK's processing.
*/ */
@ -215,7 +217,8 @@ public class VariantContextWriterStub implements Stub<VariantContextWriter>, Var
vcfHeader.addMetaDataLine(commandLineArgHeaderLine); vcfHeader.addMetaDataLine(commandLineArgHeaderLine);
} }
//vcfHeader = VCFUtils.withUpdatedContigs(vcfHeader, engine); if ( UPDATE_CONTIG_HEADERS )
vcfHeader = VCFUtils.withUpdatedContigs(vcfHeader, engine);
} }
outputTracker.getStorage(this).writeHeader(vcfHeader); outputTracker.getStorage(this).writeHeader(vcfHeader);

View File

@ -40,9 +40,7 @@ import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.ReservoirDownsampler; import org.broadinstitute.sting.utils.ReservoirDownsampler;
import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.pileup.ExtendedEventPileupElement;
import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.pileup.PileupElement;
import org.broadinstitute.sting.utils.pileup.ReadBackedExtendedEventPileupImpl;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileupImpl; import org.broadinstitute.sting.utils.pileup.ReadBackedPileupImpl;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import org.broadinstitute.sting.utils.sam.ReadUtils; import org.broadinstitute.sting.utils.sam.ReadUtils;
@ -63,7 +61,6 @@ public class LocusIteratorByState extends LocusIterator {
// member fields // member fields
// //
// ----------------------------------------------------------------------------------------------------------------- // -----------------------------------------------------------------------------------------------------------------
private boolean hasExtendedEvents = false; // will be set to true if at least one read had an indel right before the current position
/** /**
* Used to create new GenomeLocs. * Used to create new GenomeLocs.
@ -92,26 +89,10 @@ public class LocusIteratorByState extends LocusIterator {
// stepForwardOnGenome(). The next call to stepForwardOnGenome() will clear that memory (as we remember only extended // stepForwardOnGenome(). The next call to stepForwardOnGenome() will clear that memory (as we remember only extended
// events immediately preceding the current reference base). // events immediately preceding the current reference base).
boolean generateExtendedEvents = true; // should we generate an additional, special pile for indels between the ref bases? public SAMRecordState(SAMRecord read) {
// the only purpose of this flag is to shield away a few additional lines of code
// when extended piles are not needed, it may not be even worth it...
byte[] insertedBases = null; // remember full inserted sequence if we are generating piles of extended events (indels)
int eventLength = -1; // will be set to the length of insertion/deletion if we are generating piles of extended events
byte eventDelayedFlag = 0; // will be set to non-0 if there was an event (indel) right before the
// current base on the ref. We use a counter-like variable here since clearing the indel event is
// delayed by one base, so we need to remember how long ago we have seen the actual event
int eventStart = -1; // where on the read the extended event starts (i.e. the last position on the read prior to the
// event, or -1 if alignment starts with an insertion); this one is easy to recompute on the fly,
// we cache it here mainly for convenience
public SAMRecordState(SAMRecord read, boolean extended) {
this.read = read; this.read = read;
cigar = read.getCigar(); cigar = read.getCigar();
nCigarElements = cigar.numCigarElements(); nCigarElements = cigar.numCigarElements();
generateExtendedEvents = extended;
//System.out.printf("Creating a SAMRecordState: %s%n", this); //System.out.printf("Creating a SAMRecordState: %s%n", this);
} }
@ -150,27 +131,6 @@ public class LocusIteratorByState extends LocusIterator {
return curElement.getOperator(); return curElement.getOperator();
} }
/**
* Returns true if we just stepped over insertion/into a deletion prior to the last return from stepForwardOnGenome.
*
* @return
*/
public boolean hadIndel() {
return (eventLength > 0);
}
public int getEventLength() {
return eventLength;
}
public byte[] getEventBases() {
return insertedBases;
}
public int getReadEventStartOffset() {
return eventStart;
}
public String toString() { public String toString() {
return String.format("%s ro=%d go=%d co=%d cec=%d %s", read.getReadName(), readOffset, genomeOffset, cigarOffset, cigarElementCounter, curElement); return String.format("%s ro=%d go=%d co=%d cec=%d %s", read.getReadName(), readOffset, genomeOffset, cigarOffset, cigarElementCounter, curElement);
} }
@ -208,19 +168,6 @@ public class LocusIteratorByState extends LocusIterator {
genomeOffset++; // extended events need that. Logically, it's legal to advance the genomic offset here: genomeOffset++; // extended events need that. Logically, it's legal to advance the genomic offset here:
// we do step forward on the ref, and by returning null we also indicate that we are past the read end. // we do step forward on the ref, and by returning null we also indicate that we are past the read end.
if (generateExtendedEvents && eventDelayedFlag > 0) {
// if we had an indel right before the read ended (i.e. insertion was the last cigar element),
// we keep it until next reference base; then we discard it and this will allow the LocusIterator to
// finally discard this read
eventDelayedFlag--;
if (eventDelayedFlag == 0) {
eventLength = -1; // reset event when we are past it
insertedBases = null;
eventStart = -1;
}
}
return null; return null;
} }
} }
@ -232,17 +179,6 @@ public class LocusIteratorByState extends LocusIterator {
cigarElementCounter = curElement.getLength(); cigarElementCounter = curElement.getLength();
break; break;
case I: // insertion w.r.t. the reference case I: // insertion w.r.t. the reference
if (generateExtendedEvents) {
// we see insertions only once, when we step right onto them; the position on the read is scrolled
// past the insertion right after that
if (eventDelayedFlag > 1)
throw new UserException.MalformedBAM(read, String.format("Adjacent I/D events in read %s -- cigar: %s. This is an indication of a malformed file, but the SAM spec allows reads with adjacent insertion/deletion. If you are sure you want to use this read, re-run your analysis with the extra option: -rf BadCigar", read.getReadName(), read.getCigarString()));
insertedBases = Arrays.copyOfRange(read.getReadBases(), readOffset + 1, readOffset + 1 + curElement.getLength());
eventLength = curElement.getLength();
eventStart = readOffset;
eventDelayedFlag = 2; // insertion causes re-entry into stepForwardOnGenome, so we set the delay to 2
// System.out.println("Inserted "+(new String (insertedBases)) +" after "+readOffset);
} // continue onto the 'S' case !
case S: // soft clip case S: // soft clip
cigarElementCounter = curElement.getLength(); cigarElementCounter = curElement.getLength();
readOffset += curElement.getLength(); readOffset += curElement.getLength();
@ -250,19 +186,6 @@ public class LocusIteratorByState extends LocusIterator {
case D: // deletion w.r.t. the reference case D: // deletion w.r.t. the reference
if (readOffset < 0) // we don't want reads starting with deletion, this is a malformed cigar string if (readOffset < 0) // we don't want reads starting with deletion, this is a malformed cigar string
throw new UserException.MalformedBAM(read, "Read starting with deletion. Cigar: " + read.getCigarString() + ". This is an indication of a malformed file, but the SAM spec allows reads starting in deletion. If you are sure you want to use this read, re-run your analysis with the extra option: -rf BadCigar"); throw new UserException.MalformedBAM(read, "Read starting with deletion. Cigar: " + read.getCigarString() + ". This is an indication of a malformed file, but the SAM spec allows reads starting in deletion. If you are sure you want to use this read, re-run your analysis with the extra option: -rf BadCigar");
if (generateExtendedEvents) {
if (cigarElementCounter == 1) {
// generate an extended event only if we just stepped into the deletion (i.e. don't
// generate the event at every deleted position on the ref, that's what cigarElementCounter==1 is for!)
if (eventDelayedFlag > 1)
throw new UserException.MalformedBAM(read, String.format("Adjacent I/D events in read %s -- cigar: %s. This is an indication of a malformed file, but the SAM spec allows reads with adjacent insertion/deletion. If you are sure you want to use this read, re-run your analysis with the extra option: -rf BadCigar", read.getReadName(), read.getCigarString()));
eventLength = curElement.getLength();
eventDelayedFlag = 2; // deletion on the ref causes an immediate return, so we have to delay by 1 only
eventStart = readOffset;
insertedBases = null;
// System.out.println("Deleted "+eventLength +" bases after "+readOffset);
}
}
// should be the same as N case // should be the same as N case
genomeOffset++; genomeOffset++;
done = true; done = true;
@ -280,21 +203,6 @@ public class LocusIteratorByState extends LocusIterator {
throw new IllegalStateException("Case statement didn't deal with cigar op: " + curElement.getOperator()); throw new IllegalStateException("Case statement didn't deal with cigar op: " + curElement.getOperator());
} }
if (generateExtendedEvents) {
if (eventDelayedFlag > 0 && done) {
// if we did make a successful step on the ref, decrement delayed flag. If, upon the decrementing the,
// the flag is 1, we are standing on the reference base right after the indel (so we have to keep it).
// Otherwise, we are away from the previous indel and have to clear our memories...
eventDelayedFlag--; // when we notice an indel, we set delayed flag to 2, so now
// if eventDelayedFlag == 1, an indel occured right before the current base
if (eventDelayedFlag == 0) {
eventLength = -1; // reset event when we are past it
insertedBases = null;
eventStart = -1;
}
}
}
return done ? curElement.getOperator() : stepForwardOnGenome(); return done ? curElement.getOperator() : stepForwardOnGenome();
} }
} }
@ -374,147 +282,69 @@ public class LocusIteratorByState extends LocusIterator {
// this call will set hasExtendedEvents to true if it picks up a read with indel right before the current position on the ref: // this call will set hasExtendedEvents to true if it picks up a read with indel right before the current position on the ref:
readStates.collectPendingReads(); readStates.collectPendingReads();
int size = 0; final GenomeLoc location = getLocation();
int nDeletions = 0; final Map<String, ReadBackedPileupImpl> fullPileup = new HashMap<String, ReadBackedPileupImpl>();
int nInsertions = 0; boolean hasBeenSampled = false;
int nMQ0Reads = 0; for (final String sample : samples) {
final Iterator<SAMRecordState> iterator = readStates.iterator(sample);
final List<PileupElement> pile = new ArrayList<PileupElement>(readStates.size(sample));
hasBeenSampled |= location.getStart() <= readStates.getDownsamplingExtent(sample);
int size = 0; // number of elements in this sample's pileup
int nDeletions = 0; // number of deletions in this sample's pileup
int nMQ0Reads = 0; // number of MQ0 reads in this sample's pileup (warning: current implementation includes N bases that are MQ0)
// if extended events are requested, and if previous traversal step brought us over an indel in while (iterator.hasNext()) {
// at least one read, we emit extended pileup (making sure that it is associated with the previous base, final SAMRecordState state = iterator.next(); // state object with the read/offset information
// i.e. the one right *before* the indel) and do NOT shift the current position on the ref. final GATKSAMRecord read = (GATKSAMRecord) state.getRead(); // the actual read
// In this case, the subsequent call to next() will emit the normal pileup at the current base final CigarOperator op = state.getCurrentCigarOperator(); // current cigar operator
// and shift the position. final CigarElement nextElement = state.peekForwardOnGenome(); // next cigar element
if (readInfo.generateExtendedEvents() && hasExtendedEvents) { final CigarElement lastElement = state.peekBackwardOnGenome(); // last cigar element
Map<String, ReadBackedExtendedEventPileupImpl> fullExtendedEventPileup = new HashMap<String, ReadBackedExtendedEventPileupImpl>(); final CigarOperator nextOp = nextElement.getOperator(); // next cigar operator
final CigarOperator lastOp = lastElement.getOperator(); // last cigar operator
final int readOffset = state.getReadOffset(); // the base offset on this read
// get current location on the reference and decrement it by 1: the indels we just stepped over final boolean isBeforeDeletion = nextOp == CigarOperator.DELETION;
// are associated with the *previous* reference base final boolean isAfterDeletion = lastOp == CigarOperator.DELETION;
GenomeLoc loc = genomeLocParser.incPos(getLocation(), -1); final boolean isBeforeInsertion = nextOp == CigarOperator.INSERTION;
final boolean isAfterInsertion = lastOp == CigarOperator.INSERTION;
final boolean isNextToSoftClip = nextOp == CigarOperator.S || (state.getGenomeOffset() == 0 && read.getSoftStart() != read.getAlignmentStart());
boolean hasBeenSampled = false; int nextElementLength = nextElement.getLength();
for (final String sample : samples) {
Iterator<SAMRecordState> iterator = readStates.iterator(sample);
List<ExtendedEventPileupElement> indelPile = new ArrayList<ExtendedEventPileupElement>(readStates.size(sample));
hasBeenSampled |= loc.getStart() <= readStates.getDownsamplingExtent(sample);
size = 0; if (op == CigarOperator.N) // N's are never added to any pileup
nDeletions = 0; continue;
nInsertions = 0;
nMQ0Reads = 0;
int maxDeletionLength = 0;
while (iterator.hasNext()) { if (op == CigarOperator.D) {
final SAMRecordState state = iterator.next(); if (readInfo.includeReadsWithDeletionAtLoci()) { // only add deletions to the pileup if we are authorized to do so
final GATKSAMRecord read = (GATKSAMRecord) state.getRead(); // the actual read pile.add(new PileupElement(read, readOffset, true, isBeforeDeletion, isAfterDeletion, isBeforeInsertion, isAfterInsertion, isNextToSoftClip, null, nextOp == CigarOperator.D ? nextElementLength : -1));
final CigarOperator op = state.getCurrentCigarOperator(); // current cigar operator
final int readOffset = state.getReadOffset(); // the base offset on this read
final int eventStartOffset = state.getReadEventStartOffset(); // this will be -1 if base is not a deletion, or if base is the first deletion in the event. Otherwise, it will give the last base before the deletion began.
final int eventLength = state.getEventLength();
if (op == CigarOperator.N) // N's are never added to any pileup
continue;
if (state.hadIndel()) { // this read has an indel associated with the previous position on the ref
size++; size++;
ExtendedEventPileupElement pileupElement; nDeletions++;
if (state.getEventBases() == null) { // Deletion event
nDeletions++;
maxDeletionLength = Math.max(maxDeletionLength, state.getEventLength());
pileupElement = new ExtendedEventPileupElement(read, eventStartOffset, eventLength);
}
else { // Insertion event
nInsertions++;
pileupElement = new ExtendedEventPileupElement(read, eventStartOffset, eventLength, state.getEventBases());
}
if (read.getMappingQuality() == 0)
nMQ0Reads++;
indelPile.add(pileupElement);
}
// this read has no indel so add it to the pileup as a NOEVENT:
// a deletion that didn't start here (therefore, not an extended event)
// we add (mis)matches as no events.
else if (op != CigarOperator.D || readInfo.includeReadsWithDeletionAtLoci()) {
size++;
indelPile.add(new ExtendedEventPileupElement((GATKSAMRecord) state.getRead(), readOffset));
if (read.getMappingQuality() == 0) if (read.getMappingQuality() == 0)
nMQ0Reads++; nMQ0Reads++;
} }
} }
else {
if (!filterBaseInRead(read, location.getStart())) {
String insertedBaseString = null;
if (nextOp == CigarOperator.I)
insertedBaseString = new String(Arrays.copyOfRange(read.getReadBases(), readOffset + 1, readOffset + 1 + nextElement.getLength()));
if (indelPile.size() != 0) pile.add(new PileupElement(read, readOffset, false, isBeforeDeletion, isAfterDeletion, isBeforeInsertion, isAfterInsertion, isNextToSoftClip, insertedBaseString, nextElementLength));
fullExtendedEventPileup.put(sample, new ReadBackedExtendedEventPileupImpl(loc, indelPile, size, maxDeletionLength, nInsertions, nDeletions, nMQ0Reads)); size++;
} if (read.getMappingQuality() == 0)
hasExtendedEvents = false; // we are done with extended events prior to current ref base nMQ0Reads++;
nextAlignmentContext = new AlignmentContext(loc, new ReadBackedExtendedEventPileupImpl(loc, fullExtendedEventPileup), hasBeenSampled);
}
else { // this is a regular event pileup (not extended)
final GenomeLoc location = getLocation();
final Map<String, ReadBackedPileupImpl> fullPileup = new HashMap<String, ReadBackedPileupImpl>();
boolean hasBeenSampled = false;
for (final String sample : samples) {
final Iterator<SAMRecordState> iterator = readStates.iterator(sample);
final List<PileupElement> pile = new ArrayList<PileupElement>(readStates.size(sample));
hasBeenSampled |= location.getStart() <= readStates.getDownsamplingExtent(sample);
size = 0; // number of elements in this sample's pileup
nDeletions = 0; // number of deletions in this sample's pileup
nMQ0Reads = 0; // number of MQ0 reads in this sample's pileup (warning: current implementation includes N bases that are MQ0)
while (iterator.hasNext()) {
final SAMRecordState state = iterator.next(); // state object with the read/offset information
final GATKSAMRecord read = (GATKSAMRecord) state.getRead(); // the actual read
final CigarOperator op = state.getCurrentCigarOperator(); // current cigar operator
final CigarElement nextElement = state.peekForwardOnGenome(); // next cigar element
final CigarElement lastElement = state.peekBackwardOnGenome(); // last cigar element
final CigarOperator nextOp = nextElement.getOperator(); // next cigar operator
final CigarOperator lastOp = lastElement.getOperator(); // last cigar operator
final int readOffset = state.getReadOffset(); // the base offset on this read
final boolean isBeforeDeletion = nextOp == CigarOperator.DELETION;
final boolean isAfterDeletion = lastOp == CigarOperator.DELETION;
final boolean isBeforeInsertion = nextOp == CigarOperator.INSERTION;
final boolean isAfterInsertion = lastOp == CigarOperator.INSERTION;
final boolean isNextToSoftClip = nextOp == CigarOperator.S || (state.getGenomeOffset() == 0 && read.getSoftStart() != read.getAlignmentStart());
int nextElementLength = nextElement.getLength();
if (op == CigarOperator.N) // N's are never added to any pileup
continue;
if (op == CigarOperator.D) {
if (readInfo.includeReadsWithDeletionAtLoci()) { // only add deletions to the pileup if we are authorized to do so
pile.add(new PileupElement(read, readOffset, true, isBeforeDeletion, isAfterDeletion, isBeforeInsertion, isAfterInsertion, isNextToSoftClip, null, nextOp == CigarOperator.D ? nextElementLength : -1));
size++;
nDeletions++;
if (read.getMappingQuality() == 0)
nMQ0Reads++;
}
}
else {
if (!filterBaseInRead(read, location.getStart())) {
String insertedBaseString = null;
if (nextOp == CigarOperator.I)
insertedBaseString = new String(Arrays.copyOfRange(read.getReadBases(), readOffset + 1, readOffset + 1 + nextElement.getLength()));
pile.add(new PileupElement(read, readOffset, false, isBeforeDeletion, isAfterDeletion, isBeforeInsertion, isAfterInsertion, isNextToSoftClip, insertedBaseString, nextElementLength));
size++;
if (read.getMappingQuality() == 0)
nMQ0Reads++;
}
} }
} }
if (pile.size() != 0) // if this pileup added at least one base, add it to the full pileup
fullPileup.put(sample, new ReadBackedPileupImpl(location, pile, size, nDeletions, nMQ0Reads));
} }
updateReadStates(); // critical - must be called after we get the current state offsets and location if (pile.size() != 0) // if this pileup added at least one base, add it to the full pileup
if (!fullPileup.isEmpty()) // if we got reads with non-D/N over the current position, we are done fullPileup.put(sample, new ReadBackedPileupImpl(location, pile, size, nDeletions, nMQ0Reads));
nextAlignmentContext = new AlignmentContext(location, new ReadBackedPileupImpl(location, fullPileup), hasBeenSampled);
} }
updateReadStates(); // critical - must be called after we get the current state offsets and location
if (!fullPileup.isEmpty()) // if we got reads with non-D/N over the current position, we are done
nextAlignmentContext = new AlignmentContext(location, new ReadBackedPileupImpl(location, fullPileup), hasBeenSampled);
} }
} }
@ -546,9 +376,7 @@ public class LocusIteratorByState extends LocusIterator {
while (it.hasNext()) { while (it.hasNext()) {
SAMRecordState state = it.next(); SAMRecordState state = it.next();
CigarOperator op = state.stepForwardOnGenome(); CigarOperator op = state.stepForwardOnGenome();
if (state.hadIndel() && readInfo.generateExtendedEvents()) if (op == null) {
hasExtendedEvents = true;
else if (op == null) {
// we discard the read only when we are past its end AND indel at the end of the read (if any) was // we discard the read only when we are past its end AND indel at the end of the read (if any) was
// already processed. Keeping the read state that retunred null upon stepForwardOnGenome() is safe // already processed. Keeping the read state that retunred null upon stepForwardOnGenome() is safe
// as the next call to stepForwardOnGenome() will return null again AND will clear hadIndel() flag. // as the next call to stepForwardOnGenome() will return null again AND will clear hadIndel() flag.
@ -757,12 +585,9 @@ public class LocusIteratorByState extends LocusIterator {
int readCount = 0; int readCount = 0;
for (SAMRecord read : reads) { for (SAMRecord read : reads) {
if (readCount < maxReads) { if (readCount < maxReads) {
SAMRecordState state = new SAMRecordState(read, readInfo.generateExtendedEvents()); SAMRecordState state = new SAMRecordState(read);
state.stepForwardOnGenome(); state.stepForwardOnGenome();
newReadStates.add(state); newReadStates.add(state);
// TODO: What if we downsample the extended events away?
if (state.hadIndel())
hasExtendedEvents = true;
readCount++; readCount++;
} }
} }

View File

@ -251,7 +251,7 @@ public class VariantContextAdaptors {
Map<String, Object> attributes = new HashMap<String, Object>(); Map<String, Object> attributes = new HashMap<String, Object>();
Collection<Genotype> genotypes = new ArrayList<Genotype>(); Collection<Genotype> genotypes = new ArrayList<Genotype>();
Genotype call = new Genotype(name, genotypeAlleles); Genotype call = GenotypeBuilder.create(name, genotypeAlleles);
// add the call to the genotype list, and then use this list to create a VariantContext // add the call to the genotype list, and then use this list to create a VariantContext
genotypes.add(call); genotypes.add(call);
@ -344,7 +344,7 @@ public class VariantContextAdaptors {
alleles.add(allele2); alleles.add(allele2);
} }
Genotype g = new Genotype(samples[i], myAlleles); Genotype g = GenotypeBuilder.create(samples[i], myAlleles);
genotypes.add(g); genotypes.add(g);
} }

View File

@ -53,19 +53,6 @@ public class TraverseLoci<M,T> extends TraversalEngine<M,T,LocusWalker<M,T>,Locu
dataProvider.getShard().getReadMetrics().incrementNumIterations(); dataProvider.getShard().getReadMetrics().incrementNumIterations();
if ( locus.hasExtendedEventPileup() ) {
// if the alignment context we received holds an "extended" pileup (i.e. pileup of insertions/deletions
// associated with the current site), we need to update the location. The updated location still starts
// at the current genomic position, but it has to span the length of the longest deletion (if any).
location = engine.getGenomeLocParser().setStop(location,location.getStop()+locus.getExtendedEventPileup().getMaxDeletionLength());
// it is possible that the new expanded location spans the current shard boundary; the next method ensures
// that when it is the case, the reference sequence held by the ReferenceView will be reloaded so that
// the view has all the bases we are gonna need. If the location fits within the current view bounds,
// the next call will not do anything to the view:
referenceView.expandBoundsToAccomodateLoc(location);
}
// create reference context. Note that if we have a pileup of "extended events", the context will // create reference context. Note that if we have a pileup of "extended events", the context will
// hold the (longest) stretch of deleted reference bases (if deletions are present in the pileup). // hold the (longest) stretch of deleted reference bases (if deletions are present in the pileup).
ReferenceContext refContext = referenceView.getReferenceContext(location); ReferenceContext refContext = referenceView.getReferenceContext(location);

View File

@ -34,9 +34,7 @@ import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.collections.Pair;
import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.pileup.PileupElement;
import org.broadinstitute.sting.utils.pileup.ReadBackedExtendedEventPileup;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
import java.io.PrintStream; import java.io.PrintStream;
@ -79,13 +77,11 @@ public class PileupWalker extends LocusWalker<Integer, Integer> implements TreeR
String rods = getReferenceOrderedData( tracker ); String rods = getReferenceOrderedData( tracker );
if ( context.hasBasePileup() ) { ReadBackedPileup basePileup = context.getBasePileup();
ReadBackedPileup basePileup = context.getBasePileup(); out.printf("%s %s", basePileup.getPileupString((char)ref.getBase()), rods);
out.printf("%s %s", basePileup.getPileupString((char)ref.getBase()), rods); if ( SHOW_VERBOSE )
if ( SHOW_VERBOSE ) out.printf(" %s", createVerboseOutput(basePileup));
out.printf(" %s", createVerboseOutput(basePileup)); out.println();
out.println();
}
return 1; return 1;
} }

View File

@ -30,11 +30,9 @@ import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatibleWalker; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatibleWalker;
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation;
import org.broadinstitute.sting.utils.BaseUtils;
import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType; import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType;
import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine; import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine;
import org.broadinstitute.sting.utils.pileup.ReadBackedExtendedEventPileup;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
import org.broadinstitute.sting.utils.variantcontext.Genotype; import org.broadinstitute.sting.utils.variantcontext.Genotype;
import org.broadinstitute.sting.utils.variantcontext.GenotypesContext; import org.broadinstitute.sting.utils.variantcontext.GenotypesContext;
@ -72,7 +70,7 @@ public class AlleleBalance extends InfoFieldAnnotation {
// we care only about het calls // we care only about het calls
AlignmentContext context = stratifiedContexts.get(genotype.getSampleName()); AlignmentContext context = stratifiedContexts.get(genotype.getSampleName());
if ( context == null || !context.hasBasePileup() ) if ( context == null )
continue; continue;
final ReadBackedPileup pileup = context.getBasePileup(); final ReadBackedPileup pileup = context.getBasePileup();

View File

@ -11,6 +11,7 @@ import org.broadinstitute.sting.utils.codecs.vcf.VCFFormatHeaderLine;
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType; import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType;
import org.broadinstitute.sting.utils.variantcontext.Allele; import org.broadinstitute.sting.utils.variantcontext.Allele;
import org.broadinstitute.sting.utils.variantcontext.Genotype; import org.broadinstitute.sting.utils.variantcontext.Genotype;
import org.broadinstitute.sting.utils.variantcontext.GenotypeBuilder;
import org.broadinstitute.sting.utils.variantcontext.VariantContext; import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import java.util.*; import java.util.*;
@ -21,15 +22,12 @@ import java.util.*;
*/ */
public class AlleleBalanceBySample extends GenotypeAnnotation implements ExperimentalAnnotation { public class AlleleBalanceBySample extends GenotypeAnnotation implements ExperimentalAnnotation {
public Map<String, Object> annotate(RefMetaDataTracker tracker, AnnotatorCompatibleWalker walker, ReferenceContext ref, AlignmentContext stratifiedContext, VariantContext vc, Genotype g) { public void annotate(RefMetaDataTracker tracker, AnnotatorCompatibleWalker walker, ReferenceContext ref, AlignmentContext stratifiedContext, VariantContext vc, Genotype g, final GenotypeBuilder gb) {
Double ratio = annotateSNP(stratifiedContext, vc, g); Double ratio = annotateSNP(stratifiedContext, vc, g);
if (ratio == null) if (ratio == null)
return null; return;
Map<String, Object> map = new HashMap<String, Object>();
map.put(getKeyNames().get(0), String.format("%.2f", ratio.doubleValue()));
return map;
gb.attribute(getKeyNames().get(0), Double.valueOf(String.format("%.2f", ratio.doubleValue())));
} }
private Double annotateSNP(AlignmentContext stratifiedContext, VariantContext vc, Genotype g) { private Double annotateSNP(AlignmentContext stratifiedContext, VariantContext vc, Genotype g) {
@ -51,9 +49,6 @@ public class AlleleBalanceBySample extends GenotypeAnnotation implements Experim
if ( altAlleles.size() == 0 ) if ( altAlleles.size() == 0 )
return null; return null;
if ( !stratifiedContext.hasBasePileup() )
return null;
final String bases = new String(stratifiedContext.getBasePileup().getBases()); final String bases = new String(stratifiedContext.getBasePileup().getBases());
if ( bases.length() == 0 ) if ( bases.length() == 0 )
return null; return null;

View File

@ -59,8 +59,6 @@ public class BaseCounts extends InfoFieldAnnotation {
int[] counts = new int[4]; int[] counts = new int[4];
for ( Map.Entry<String, AlignmentContext> sample : stratifiedContexts.entrySet() ) { for ( Map.Entry<String, AlignmentContext> sample : stratifiedContexts.entrySet() ) {
if ( !sample.getValue().hasBasePileup() )
continue;
for (byte base : sample.getValue().getBasePileup().getBases() ) { for (byte base : sample.getValue().getBasePileup().getBases() ) {
int index = BaseUtils.simpleBaseToBaseIndex(base); int index = BaseUtils.simpleBaseToBaseIndex(base);
if ( index != -1 ) if ( index != -1 )

View File

@ -44,7 +44,7 @@ public class DepthOfCoverage extends InfoFieldAnnotation implements StandardAnno
int depth = 0; int depth = 0;
for ( Map.Entry<String, AlignmentContext> sample : stratifiedContexts.entrySet() ) for ( Map.Entry<String, AlignmentContext> sample : stratifiedContexts.entrySet() )
depth += sample.getValue().hasBasePileup() ? sample.getValue().getBasePileup().depthOfCoverage() : 0; depth += sample.getValue().getBasePileup().depthOfCoverage();
Map<String, Object> map = new HashMap<String, Object>(); Map<String, Object> map = new HashMap<String, Object>();
map.put(getKeyNames().get(0), String.format("%d", depth)); map.put(getKeyNames().get(0), String.format("%d", depth));
return map; return map;

View File

@ -1,12 +1,12 @@
package org.broadinstitute.sting.gatk.walkers.annotator; package org.broadinstitute.sting.gatk.walkers.annotator;
import org.apache.commons.lang.ArrayUtils;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatibleWalker; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatibleWalker;
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.GenotypeAnnotation; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.GenotypeAnnotation;
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnotation; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnotation;
import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants;
import org.broadinstitute.sting.utils.codecs.vcf.VCFFormatHeaderLine; import org.broadinstitute.sting.utils.codecs.vcf.VCFFormatHeaderLine;
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineCount; import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineCount;
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType; import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType;
@ -14,6 +14,7 @@ import org.broadinstitute.sting.utils.pileup.PileupElement;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
import org.broadinstitute.sting.utils.variantcontext.Allele; import org.broadinstitute.sting.utils.variantcontext.Allele;
import org.broadinstitute.sting.utils.variantcontext.Genotype; import org.broadinstitute.sting.utils.variantcontext.Genotype;
import org.broadinstitute.sting.utils.variantcontext.GenotypeBuilder;
import org.broadinstitute.sting.utils.variantcontext.VariantContext; import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import java.util.*; import java.util.*;
@ -44,22 +45,17 @@ public class DepthPerAlleleBySample extends GenotypeAnnotation implements Standa
private static final String DEL = "DEL"; // constant, for speed: no need to create a key string for deletion allele every time private static final String DEL = "DEL"; // constant, for speed: no need to create a key string for deletion allele every time
public Map<String, Object> annotate(RefMetaDataTracker tracker, AnnotatorCompatibleWalker walker, ReferenceContext ref, AlignmentContext stratifiedContext, VariantContext vc, Genotype g) { public void annotate(RefMetaDataTracker tracker, AnnotatorCompatibleWalker walker, ReferenceContext ref, AlignmentContext stratifiedContext, VariantContext vc, Genotype g, GenotypeBuilder gb) {
if ( g == null || !g.isCalled() ) if ( g == null || !g.isCalled() )
return null; return;
if ( vc.isSNP() ) if ( vc.isSNP() )
return annotateSNP(stratifiedContext, vc); annotateSNP(stratifiedContext, vc, gb);
if ( vc.isIndel() ) else if ( vc.isIndel() )
return annotateIndel(stratifiedContext, vc); annotateIndel(stratifiedContext, vc, gb);
return null;
} }
private Map<String,Object> annotateSNP(AlignmentContext stratifiedContext, VariantContext vc) { private void annotateSNP(AlignmentContext stratifiedContext, VariantContext vc, GenotypeBuilder gb) {
if ( ! stratifiedContext.hasBasePileup() )
return null;
HashMap<Byte, Integer> alleleCounts = new HashMap<Byte, Integer>(); HashMap<Byte, Integer> alleleCounts = new HashMap<Byte, Integer>();
for ( Allele allele : vc.getAlleles() ) for ( Allele allele : vc.getAlleles() )
@ -72,22 +68,18 @@ public class DepthPerAlleleBySample extends GenotypeAnnotation implements Standa
} }
// we need to add counts in the correct order // we need to add counts in the correct order
Integer[] counts = new Integer[alleleCounts.size()]; int[] counts = new int[alleleCounts.size()];
counts[0] = alleleCounts.get(vc.getReference().getBases()[0]); counts[0] = alleleCounts.get(vc.getReference().getBases()[0]);
for (int i = 0; i < vc.getAlternateAlleles().size(); i++) for (int i = 0; i < vc.getAlternateAlleles().size(); i++)
counts[i+1] = alleleCounts.get(vc.getAlternateAllele(i).getBases()[0]); counts[i+1] = alleleCounts.get(vc.getAlternateAllele(i).getBases()[0]);
return toADAnnotation(counts); gb.AD(counts);
} }
private Map<String,Object> annotateIndel(AlignmentContext stratifiedContext, VariantContext vc) { private void annotateIndel(AlignmentContext stratifiedContext, VariantContext vc, GenotypeBuilder gb) {
if ( ! stratifiedContext.hasBasePileup() )
return null;
ReadBackedPileup pileup = stratifiedContext.getBasePileup(); ReadBackedPileup pileup = stratifiedContext.getBasePileup();
if ( pileup == null ) if ( pileup == null )
return null; return;
final HashMap<String, Integer> alleleCounts = new HashMap<String, Integer>(); final HashMap<String, Integer> alleleCounts = new HashMap<String, Integer>();
alleleCounts.put(REF_ALLELE, 0); alleleCounts.put(REF_ALLELE, 0);
@ -123,16 +115,12 @@ public class DepthPerAlleleBySample extends GenotypeAnnotation implements Standa
} }
} }
Integer[] counts = new Integer[alleleCounts.size()]; int[] counts = new int[alleleCounts.size()];
counts[0] = alleleCounts.get(REF_ALLELE); counts[0] = alleleCounts.get(REF_ALLELE);
for (int i = 0; i < vc.getAlternateAlleles().size(); i++) for (int i = 0; i < vc.getAlternateAlleles().size(); i++)
counts[i+1] = alleleCounts.get( getAlleleRepresentation(vc.getAlternateAllele(i)) ); counts[i+1] = alleleCounts.get( getAlleleRepresentation(vc.getAlternateAllele(i)) );
return toADAnnotation(counts); gb.AD(counts);
}
private final Map<String, Object> toADAnnotation(final Integer[] counts) {
return Collections.singletonMap(getKeyNames().get(0), (Object)Arrays.asList(counts));
} }
private String getAlleleRepresentation(Allele allele) { private String getAlleleRepresentation(Allele allele) {
@ -145,7 +133,7 @@ public class DepthPerAlleleBySample extends GenotypeAnnotation implements Standa
} }
// public String getIndelBases() // public String getIndelBases()
public List<String> getKeyNames() { return Arrays.asList("AD"); } public List<String> getKeyNames() { return Arrays.asList(VCFConstants.GENOTYPE_ALLELE_DEPTHS); }
public List<VCFFormatHeaderLine> getDescriptions() { public List<VCFFormatHeaderLine> getDescriptions() {
return Arrays.asList( return Arrays.asList(

View File

@ -296,7 +296,7 @@ public class FisherStrand extends InfoFieldAnnotation implements StandardAnnotat
for ( String sample : stratifiedContexts.keySet() ) { for ( String sample : stratifiedContexts.keySet() ) {
final AlignmentContext context = stratifiedContexts.get(sample); final AlignmentContext context = stratifiedContexts.get(sample);
if ( context == null || !context.hasBasePileup() ) if ( context == null )
continue; continue;
final ReadBackedPileup pileup = context.getBasePileup(); final ReadBackedPileup pileup = context.getBasePileup();

View File

@ -74,9 +74,6 @@ public class HaplotypeScore extends InfoFieldAnnotation implements StandardAnnot
final int locus = ref.getLocus().getStart() + (ref.getLocus().getStop() - ref.getLocus().getStart()) / 2; final int locus = ref.getLocus().getStart() + (ref.getLocus().getStop() - ref.getLocus().getStart()) / 2;
if ( !context.hasBasePileup() )
return null;
final ReadBackedPileup pileup = context.getBasePileup(); final ReadBackedPileup pileup = context.getBasePileup();
// Compute all haplotypes consistent with the current read pileup // Compute all haplotypes consistent with the current read pileup
@ -86,7 +83,7 @@ public class HaplotypeScore extends InfoFieldAnnotation implements StandardAnnot
if (haplotypes != null) { if (haplotypes != null) {
for (final Genotype genotype : vc.getGenotypes()) { for (final Genotype genotype : vc.getGenotypes()) {
final AlignmentContext thisContext = stratifiedContexts.get(genotype.getSampleName()); final AlignmentContext thisContext = stratifiedContexts.get(genotype.getSampleName());
if (thisContext != null && thisContext.hasBasePileup()) { if (thisContext != null) {
final ReadBackedPileup thisPileup = thisContext.getBasePileup(); final ReadBackedPileup thisPileup = thisContext.getBasePileup();
if (vc.isSNP()) if (vc.isSNP())
scoreRA.add(scoreReadsAgainstHaplotypes(haplotypes, thisPileup, contextSize, locus)); // Taking the simple average of all sample's score since the score can be negative and the RMS doesn't make sense scoreRA.add(scoreReadsAgainstHaplotypes(haplotypes, thisPileup, contextSize, locus)); // Taking the simple average of all sample's score since the score can be negative and the RMS doesn't make sense

View File

@ -31,9 +31,6 @@ public class LowMQ extends InfoFieldAnnotation {
double total = 0; double total = 0;
for ( Map.Entry<String, AlignmentContext> sample : stratifiedContexts.entrySet() ) for ( Map.Entry<String, AlignmentContext> sample : stratifiedContexts.entrySet() )
{ {
if ( !sample.getValue().hasBasePileup() )
continue;
for ( PileupElement p : sample.getValue().getBasePileup() ) for ( PileupElement p : sample.getValue().getBasePileup() )
{ {
if ( p.getMappingQual() == 0 ) { mq0 += 1; } if ( p.getMappingQual() == 0 ) { mq0 += 1; }

View File

@ -31,12 +31,10 @@ public class MappingQualityZero extends InfoFieldAnnotation implements StandardA
int mq0 = 0; int mq0 = 0;
for ( Map.Entry<String, AlignmentContext> sample : stratifiedContexts.entrySet() ) { for ( Map.Entry<String, AlignmentContext> sample : stratifiedContexts.entrySet() ) {
final AlignmentContext context = sample.getValue(); final AlignmentContext context = sample.getValue();
if ( context.hasBasePileup() ) { final ReadBackedPileup pileup = context.getBasePileup();
final ReadBackedPileup pileup = context.getBasePileup(); for (PileupElement p : pileup ) {
for (PileupElement p : pileup ) { if ( p.getMappingQual() == 0 )
if ( p.getMappingQual() == 0 ) mq0++;
mq0++;
}
} }
} }
Map<String, Object> map = new HashMap<String, Object>(); Map<String, Object> map = new HashMap<String, Object>();

View File

@ -36,33 +36,30 @@ import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType;
import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.pileup.PileupElement;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
import org.broadinstitute.sting.utils.variantcontext.Genotype; import org.broadinstitute.sting.utils.variantcontext.Genotype;
import org.broadinstitute.sting.utils.variantcontext.GenotypeBuilder;
import org.broadinstitute.sting.utils.variantcontext.VariantContext; import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import java.util.Arrays; import java.util.Arrays;
import java.util.HashMap;
import java.util.List; import java.util.List;
import java.util.Map;
/** /**
* Count for each sample of mapping quality zero reads * Count for each sample of mapping quality zero reads
*/ */
public class MappingQualityZeroBySample extends GenotypeAnnotation { public class MappingQualityZeroBySample extends GenotypeAnnotation {
public Map<String, Object> annotate(RefMetaDataTracker tracker, public void annotate(RefMetaDataTracker tracker,
AnnotatorCompatibleWalker walker, ReferenceContext ref, AlignmentContext context, VariantContext vc, Genotype g) { AnnotatorCompatibleWalker walker, ReferenceContext ref, AlignmentContext context,
VariantContext vc, Genotype g, GenotypeBuilder gb) {
if ( g == null || !g.isCalled() ) if ( g == null || !g.isCalled() )
return null; return;
int mq0 = 0; int mq0 = 0;
if ( context.hasBasePileup() ) { final ReadBackedPileup pileup = context.getBasePileup();
final ReadBackedPileup pileup = context.getBasePileup(); for (PileupElement p : pileup ) {
for (PileupElement p : pileup ) { if ( p.getMappingQual() == 0 )
if ( p.getMappingQual() == 0 ) mq0++;
mq0++;
}
} }
Map<String, Object> map = new HashMap<String, Object>();
map.put(getKeyNames().get(0), String.format("%d", mq0)); gb.attribute(getKeyNames().get(0), mq0);
return map;
} }
public List<String> getKeyNames() { return Arrays.asList(VCFConstants.MAPPING_QUALITY_ZERO_KEY); } public List<String> getKeyNames() { return Arrays.asList(VCFConstants.MAPPING_QUALITY_ZERO_KEY); }

View File

@ -31,12 +31,10 @@ public class MappingQualityZeroFraction extends InfoFieldAnnotation implements E
for ( Map.Entry<String, AlignmentContext> sample : stratifiedContexts.entrySet() ) { for ( Map.Entry<String, AlignmentContext> sample : stratifiedContexts.entrySet() ) {
AlignmentContext context = sample.getValue(); AlignmentContext context = sample.getValue();
depth += context.size(); depth += context.size();
if ( context.hasBasePileup() ) { final ReadBackedPileup pileup = context.getBasePileup();
final ReadBackedPileup pileup = context.getBasePileup(); for (PileupElement p : pileup ) {
for (PileupElement p : pileup ) { if ( p.getMappingQual() == 0 )
if ( p.getMappingQual() == 0 ) mq0++;
mq0++;
}
} }
} }
if (depth > 0) { if (depth > 0) {

View File

@ -28,15 +28,13 @@ public class NBaseCount extends InfoFieldAnnotation {
int countRegularBaseSolid = 0; int countRegularBaseSolid = 0;
for( final AlignmentContext context : stratifiedContexts.values() ) { for( final AlignmentContext context : stratifiedContexts.values() ) {
if ( context.hasBasePileup() ) { // must be called as getBasePileup may throw error when pileup has no bases for( final PileupElement p : context.getBasePileup()) {
for( final PileupElement p : context.getBasePileup()) { final String platform = p.getRead().getReadGroup().getPlatform();
final String platform = p.getRead().getReadGroup().getPlatform(); if( platform != null && platform.toUpperCase().contains("SOLID") ) {
if( platform != null && platform.toUpperCase().contains("SOLID") ) { if( BaseUtils.isNBase( p.getBase() ) ) {
if( BaseUtils.isNBase( p.getBase() ) ) { countNBaseSolid++;
countNBaseSolid++; } else if( BaseUtils.isRegularBase( p.getBase() ) ) {
} else if( BaseUtils.isRegularBase( p.getBase() ) ) { countRegularBaseSolid++;
countRegularBaseSolid++;
}
} }
} }
} }

View File

@ -48,7 +48,7 @@ public class QualByDepth extends InfoFieldAnnotation implements StandardAnnotati
if ( context == null ) if ( context == null )
continue; continue;
depth += context.hasBasePileup() ? context.getBasePileup().depthOfCoverage() : 0; depth += context.getBasePileup().depthOfCoverage();
} }
if ( depth == 0 ) if ( depth == 0 )

View File

@ -42,12 +42,10 @@ public class RMSMappingQuality extends InfoFieldAnnotation implements StandardAn
for ( Map.Entry<String, AlignmentContext> sample : stratifiedContexts.entrySet() ) { for ( Map.Entry<String, AlignmentContext> sample : stratifiedContexts.entrySet() ) {
AlignmentContext context = sample.getValue(); AlignmentContext context = sample.getValue();
if ( context.hasBasePileup() ) { final ReadBackedPileup pileup = context.getBasePileup();
final ReadBackedPileup pileup = context.getBasePileup(); for (PileupElement p : pileup ) {
for (PileupElement p : pileup ) { if ( p.getMappingQual() != QualityUtils.MAPPING_QUALITY_UNAVAILABLE )
if ( p.getMappingQual() != QualityUtils.MAPPING_QUALITY_UNAVAILABLE ) qualities[index++] = p.getMappingQual();
qualities[index++] = p.getMappingQual();
}
} }
} }

View File

@ -63,9 +63,6 @@ public abstract class RankSumTest extends InfoFieldAnnotation implements Standar
continue; continue;
} }
if (!context.hasBasePileup())
continue;
final ReadBackedPileup pileup = context.getBasePileup(); final ReadBackedPileup pileup = context.getBasePileup();
if (pileup == null) if (pileup == null)
continue; continue;

View File

@ -35,11 +35,9 @@ public class SpanningDeletions extends InfoFieldAnnotation implements StandardAn
int depth = 0; int depth = 0;
for ( Map.Entry<String, AlignmentContext> sample : stratifiedContexts.entrySet() ) { for ( Map.Entry<String, AlignmentContext> sample : stratifiedContexts.entrySet() ) {
AlignmentContext context = sample.getValue(); AlignmentContext context = sample.getValue();
if ( context.hasBasePileup() ) { final ReadBackedPileup pileup = context.getBasePileup();
final ReadBackedPileup pileup = context.getBasePileup(); deletions += pileup.getNumberOfDeletions();
deletions += pileup.getNumberOfDeletions(); depth += pileup.getNumberOfElements();
depth += pileup.getNumberOfElements();
}
} }
Map<String, Object> map = new HashMap<String, Object>(); Map<String, Object> map = new HashMap<String, Object>();
map.put(getKeyNames().get(0), String.format("%.2f", depth == 0 ? 0.0 : (double)deletions/(double)depth)); map.put(getKeyNames().get(0), String.format("%.2f", depth == 0 ? 0.0 : (double)deletions/(double)depth));

View File

@ -39,18 +39,16 @@ public class TechnologyComposition extends InfoFieldAnnotation implements Experi
for ( Map.Entry<String, AlignmentContext> sample : stratifiedContexts.entrySet() ) { for ( Map.Entry<String, AlignmentContext> sample : stratifiedContexts.entrySet() ) {
AlignmentContext context = sample.getValue(); AlignmentContext context = sample.getValue();
if ( context.hasBasePileup() ) { final ReadBackedPileup pileup = context.getBasePileup();
final ReadBackedPileup pileup = context.getBasePileup(); for ( PileupElement p : pileup ) {
for ( PileupElement p : pileup ) { if(ReadUtils.is454Read(p.getRead()))
if(ReadUtils.is454Read(p.getRead())) reads454++;
reads454++; else if (ReadUtils.isSOLiDRead(p.getRead()))
else if (ReadUtils.isSOLiDRead(p.getRead())) readsSolid++;
readsSolid++; else if (ReadUtils.isIlluminaRead(p.getRead()))
else if (ReadUtils.isIlluminaRead(p.getRead())) readsIllumina++;
readsIllumina++; else
else readsOther++;
readsOther++;
}
} }
} }

View File

@ -305,12 +305,10 @@ public class VariantAnnotator extends RodWalker<Integer, Integer> implements Ann
// if the reference base is not ambiguous, we can annotate // if the reference base is not ambiguous, we can annotate
Map<String, AlignmentContext> stratifiedContexts; Map<String, AlignmentContext> stratifiedContexts;
if ( BaseUtils.simpleBaseToBaseIndex(ref.getBase()) != -1 ) { if ( BaseUtils.simpleBaseToBaseIndex(ref.getBase()) != -1 ) {
if ( context.hasBasePileup() ) { stratifiedContexts = AlignmentContextUtils.splitContextBySampleName(context.getBasePileup());
stratifiedContexts = AlignmentContextUtils.splitContextBySampleName(context.getBasePileup()); annotatedVCs = new ArrayList<VariantContext>(VCs.size());
annotatedVCs = new ArrayList<VariantContext>(VCs.size()); for ( VariantContext vc : VCs )
for ( VariantContext vc : VCs ) annotatedVCs.add(engine.annotateContext(tracker, ref, stratifiedContexts, vc));
annotatedVCs.add(engine.annotateContext(tracker, ref, stratifiedContexts, vc));
}
} }
for ( VariantContext annotatedVC : annotatedVCs ) for ( VariantContext annotatedVC : annotatedVCs )

View File

@ -261,24 +261,22 @@ public class VariantAnnotatorEngine {
} }
private GenotypesContext annotateGenotypes(RefMetaDataTracker tracker, ReferenceContext ref, Map<String, AlignmentContext> stratifiedContexts, VariantContext vc) { private GenotypesContext annotateGenotypes(RefMetaDataTracker tracker, ReferenceContext ref, Map<String, AlignmentContext> stratifiedContexts, VariantContext vc) {
if ( requestedGenotypeAnnotations.size() == 0 ) if ( requestedGenotypeAnnotations.isEmpty() )
return vc.getGenotypes(); return vc.getGenotypes();
GenotypesContext genotypes = GenotypesContext.create(vc.getNSamples()); final GenotypesContext genotypes = GenotypesContext.create(vc.getNSamples());
for ( final Genotype genotype : vc.getGenotypes() ) { for ( final Genotype genotype : vc.getGenotypes() ) {
AlignmentContext context = stratifiedContexts.get(genotype.getSampleName()); AlignmentContext context = stratifiedContexts.get(genotype.getSampleName());
if ( context == null ) { if ( context == null ) {
genotypes.add(genotype); genotypes.add(genotype);
continue; } else {
final GenotypeBuilder gb = new GenotypeBuilder(genotype);
for ( final GenotypeAnnotation annotation : requestedGenotypeAnnotations ) {
annotation.annotate(tracker, walker, ref, context, vc, genotype, gb);
}
genotypes.add(gb.make());
} }
Map<String, Object> genotypeAnnotations = new HashMap<String, Object>(genotype.getAttributes());
for ( GenotypeAnnotation annotation : requestedGenotypeAnnotations ) {
Map<String, Object> result = annotation.annotate(tracker, walker, ref, context, vc, genotype);
if ( result != null )
genotypeAnnotations.putAll(result);
}
genotypes.add(new Genotype(genotype.getSampleName(), genotype.getAlleles(), genotype.getLog10PError(), genotype.getFilters(), genotypeAnnotations, genotype.isPhased()));
} }
return genotypes; return genotypes;

View File

@ -5,6 +5,7 @@ import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.utils.codecs.vcf.VCFFormatHeaderLine; import org.broadinstitute.sting.utils.codecs.vcf.VCFFormatHeaderLine;
import org.broadinstitute.sting.utils.variantcontext.Genotype; import org.broadinstitute.sting.utils.variantcontext.Genotype;
import org.broadinstitute.sting.utils.variantcontext.GenotypeBuilder;
import org.broadinstitute.sting.utils.variantcontext.VariantContext; import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import java.util.List; import java.util.List;
@ -13,8 +14,9 @@ import java.util.Map;
public abstract class GenotypeAnnotation extends VariantAnnotatorAnnotation { public abstract class GenotypeAnnotation extends VariantAnnotatorAnnotation {
// return annotations for the given contexts/genotype split by sample // return annotations for the given contexts/genotype split by sample
public abstract Map<String, Object> annotate(RefMetaDataTracker tracker, AnnotatorCompatibleWalker walker, public abstract void annotate(RefMetaDataTracker tracker, AnnotatorCompatibleWalker walker,
ReferenceContext ref, AlignmentContext stratifiedContext, VariantContext vc, Genotype g); ReferenceContext ref, AlignmentContext stratifiedContext,
VariantContext vc, Genotype g, GenotypeBuilder gb );
// return the descriptions used for the VCF FORMAT meta field // return the descriptions used for the VCF FORMAT meta field
public abstract List<VCFFormatHeaderLine> getDescriptions(); public abstract List<VCFFormatHeaderLine> getDescriptions();

View File

@ -204,8 +204,6 @@ public class BeagleOutputToVCFWalker extends RodWalker<Integer, Integer> {
} }
for ( final Genotype g : vc_input.getGenotypes() ) { for ( final Genotype g : vc_input.getGenotypes() ) {
Set<String> filters = new LinkedHashSet<String>(g.getFilters());
boolean genotypeIsPhased = true; boolean genotypeIsPhased = true;
String sample = g.getSampleName(); String sample = g.getSampleName();
@ -271,7 +269,7 @@ public class BeagleOutputToVCFWalker extends RodWalker<Integer, Integer> {
// Compute new GQ field = -10*log10Pr(Genotype call is wrong) // Compute new GQ field = -10*log10Pr(Genotype call is wrong)
// Beagle gives probability that genotype is AA, AB and BB. // Beagle gives probability that genotype is AA, AB and BB.
// Which, by definition, are prob of hom ref, het and hom var. // Which, by definition, are prob of hom ref, het and hom var.
Double probWrongGenotype, genotypeQuality; double probWrongGenotype, genotypeQuality;
Double homRefProbability = Double.valueOf(beagleProbabilities.get(0)); Double homRefProbability = Double.valueOf(beagleProbabilities.get(0));
Double hetProbability = Double.valueOf(beagleProbabilities.get(1)); Double hetProbability = Double.valueOf(beagleProbabilities.get(1));
Double homVarProbability = Double.valueOf(beagleProbabilities.get(2)); Double homVarProbability = Double.valueOf(beagleProbabilities.get(2));
@ -300,7 +298,7 @@ public class BeagleOutputToVCFWalker extends RodWalker<Integer, Integer> {
else else
genotypeQuality = log10(probWrongGenotype); genotypeQuality = log10(probWrongGenotype);
HashMap<String,Object> originalAttributes = new HashMap<String,Object>(g.getAttributes()); HashMap<String,Object> originalAttributes = new HashMap<String,Object>(g.getExtendedAttributes());
// get original encoding and add to keynotype attributes // get original encoding and add to keynotype attributes
String a1, a2, og; String a1, a2, og;
@ -328,7 +326,7 @@ public class BeagleOutputToVCFWalker extends RodWalker<Integer, Integer> {
else { else {
originalAttributes.put("OG","."); originalAttributes.put("OG",".");
} }
Genotype imputedGenotype = new Genotype(g.getSampleName(), alleles, genotypeQuality, filters,originalAttributes , genotypeIsPhased); Genotype imputedGenotype = new GenotypeBuilder(g).alleles(alleles).log10PError(genotypeQuality).attributes(originalAttributes).phased(genotypeIsPhased).make();
if ( imputedGenotype.isHet() || imputedGenotype.isHomVar() ) { if ( imputedGenotype.isHet() || imputedGenotype.isHomVar() ) {
beagleVarCounts++; beagleVarCounts++;
} }

View File

@ -1,6 +1,7 @@
package org.broadinstitute.sting.gatk.walkers.bqsr; package org.broadinstitute.sting.gatk.walkers.bqsr;
import org.broadinstitute.sting.utils.BitSetUtils; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.exceptions.UserException;
import java.util.*; import java.util.*;
@ -26,147 +27,135 @@ import java.util.*;
*/ */
public class BQSRKeyManager { public class BQSRKeyManager {
private final List<Covariate> requiredCovariates; private final Covariate[] requiredCovariates;
private final List<Covariate> optionalCovariates; private final Covariate[] optionalCovariates;
private final List<RequiredCovariateInfo> requiredCovariatesInfo; private final RequiredCovariateInfo[] requiredCovariatesInfo;
private final List<OptionalCovariateInfo> optionalCovariatesInfo; private final OptionalCovariateInfo[] optionalCovariatesInfo;
private final Map<String, Short> covariateNameToIDMap; private final Map<String, Short> covariateNameToIDMap;
private int nRequiredBits; // Number of bits used to represent the required covariates private int nRequiredBits; // Number of bits used to represent the required covariates
private int nOptionalBits; // Number of bits used to represent the standard covaraites
private final int nOptionalIDBits; // Number of bits used to represent the optional covariates IDs private final int optionalCovariateOffset;
private final int totalNumberOfBits; // Sum of all of the above plus the event bits private final int optionalCovariateIDOffset;
private final BitSet optionalCovariateMask; // Standard mask for optional covariates bitset private final long optionalCovariateMask; // Standard mask for optional covariates key
private final BitSet optionalCovariateIDMask; // Standard mask for optional covariates order bitset private final long optionalCovariateIDMask; // Standard mask for optional covariates order key
private final long eventIDMask; // Standard mask for event ID
/** /**
* Initializes the KeyManager with the total number of covariates to use * Initializes the KeyManager with the total number of covariates to use
* *
* @param requiredCovariates the ordered list of required covariates * @param requiredCovariates the ordered list of required covariates
* @param optionalCovariates the ordered list of optional covariates * @param optionalCovariates the ordered list of optional covariates
*/ */
public BQSRKeyManager(List<Covariate> requiredCovariates, List<Covariate> optionalCovariates) { public BQSRKeyManager(final List<Covariate> requiredCovariates, final List<Covariate> optionalCovariates) {
this.requiredCovariates = new ArrayList<Covariate>(requiredCovariates); this.requiredCovariates = new Covariate[requiredCovariates.size()];
this.optionalCovariates = new ArrayList<Covariate>(optionalCovariates); this.optionalCovariates = new Covariate[optionalCovariates.size()];
requiredCovariatesInfo = new ArrayList<RequiredCovariateInfo>(requiredCovariates.size()); // initialize the required covariates list requiredCovariatesInfo = new RequiredCovariateInfo[requiredCovariates.size()]; // initialize the required covariates list
optionalCovariatesInfo = new ArrayList<OptionalCovariateInfo>(optionalCovariates.size()); // initialize the optional covariates list (size may be 0, it's okay) optionalCovariatesInfo = new OptionalCovariateInfo[optionalCovariates.size()]; // initialize the optional covariates list (size may be 0, it's okay)
covariateNameToIDMap = new HashMap<String, Short>(optionalCovariates.size()*2); // the map from covariate name to covariate id (when reading GATK Reports, we get the IDs as names of covariates) covariateNameToIDMap = new HashMap<String, Short>(optionalCovariates.size()*2); // the map from covariate name to covariate id (when reading GATK Reports, we get the IDs as names of covariates)
nRequiredBits = 0; nRequiredBits = 0;
for (Covariate required : requiredCovariates) { // create a list of required covariates with the extra information for key management for (int i = 0; i < requiredCovariates.size(); i++) { // create a list of required covariates with the extra information for key management
int nBits = required.numberOfBits(); // number of bits used by this covariate final Covariate required = requiredCovariates.get(i);
BitSet mask = genericMask(nRequiredBits, nBits); // create a mask for this covariate final int nBits = required.numberOfBits(); // number of bits used by this covariate
requiredCovariatesInfo.add(new RequiredCovariateInfo(nRequiredBits, mask, required)); // Create an object for this required covariate final long mask = genericMask(nRequiredBits, nBits); // create a mask for this covariate
this.requiredCovariates[i] = required;
requiredCovariatesInfo[i] = new RequiredCovariateInfo(nBits, nRequiredBits, mask, required); // Create an object for this required covariate
nRequiredBits += nBits; nRequiredBits += nBits;
} }
final int bitsInEventType = numberOfBitsToRepresent(EventType.values().length);
eventIDMask = genericMask(nRequiredBits, bitsInEventType);
short id = 0; short id = 0;
nOptionalBits = 0; int nOptionalBits = 0;
for (Covariate optional : optionalCovariates) { for (int i = 0; i < optionalCovariates.size(); i++) {
int nBits = optional.numberOfBits(); // number of bits used by this covariate final Covariate optional = optionalCovariates.get(i);
nOptionalBits = Math.max(nOptionalBits, nBits); // optional covariates are represented by the number of bits needed by biggest covariate nOptionalBits = Math.max(nOptionalBits, optional.numberOfBits()); // optional covariates are represented by the number of bits needed by biggest covariate
BitSet optionalID = bitSetFromId(id); // calculate the optional covariate ID for this covariate this.optionalCovariates[i] = optional;
optionalCovariatesInfo.add(new OptionalCovariateInfo(optionalID, optional)); // optional covariates have standardized mask and number of bits, so no need to store in the RequiredCovariateInfo object optionalCovariatesInfo[i] = new OptionalCovariateInfo(id, optional);
String covariateName = optional.getClass().getSimpleName().split("Covariate")[0]; // get the name of the covariate (without the "covariate" part of it) so we can match with the GATKReport final String covariateName = optional.getClass().getSimpleName().split("Covariate")[0]; // get the name of the covariate (without the "covariate" part of it) so we can match with the GATKReport
covariateNameToIDMap.put(covariateName, id); covariateNameToIDMap.put(covariateName, id);
id++; id++;
} }
nOptionalIDBits = BitSetUtils.numberOfBitsToRepresent(optionalCovariates.size()); // number of bits used to represent the covariate ID optionalCovariateOffset = nRequiredBits + bitsInEventType;
optionalCovariateMask = genericMask(nRequiredBits, nOptionalBits); // the generic mask to extract optional covariate bits from the combined bitset optionalCovariateMask = genericMask(optionalCovariateOffset, nOptionalBits); // the generic mask to extract optional covariate bits from the combined bitset
optionalCovariateIDMask = genericMask(nRequiredBits + nOptionalBits, nOptionalIDBits); // the generic mask to extract optional covariate ID bits from the combined bitset optionalCovariateIDOffset = nRequiredBits + bitsInEventType + nOptionalBits;
totalNumberOfBits = nRequiredBits + nOptionalBits + nOptionalIDBits + bitsInEventType(); // total number of bits used in the final key final int nOptionalIDBits = numberOfBitsToRepresent(optionalCovariates.size()); // number of bits used to represent the covariate ID
optionalCovariateIDMask = genericMask(optionalCovariateIDOffset, nOptionalIDBits); // the generic mask to extract optional covariate ID bits from the combined bitset
final int totalNumberOfBits = optionalCovariateIDOffset + nOptionalIDBits; // total number of bits used in the final key
if ( totalNumberOfBits > 64 )
throw new UserException.BadInput("The total number of bits used for the master BQSR key is greater than 64 and cannot be represented in a long");
} }
/** /**
* Generates one key per optional covariate. * Generates one key given the optional covariate (or none if it is null)
* *
* Keys include all required covariates, the standard covariate and the event type. * Keys include all required covariates, the standard covariate and the event type.
* *
* Example allKeys: * @param allKeys The keys in long representation for each covariate (includes all optional covariates, not just the one requested)
* RG, QUAL, CYCLE, CONTEXT * @param eventType The type of event described by this keyset (e.g. mismatches, insertions, deletions)
* * @return one key in long representation (non-negative) or -1 for a bad key
* List of BitSets returned by this example (given eventType):
* RG, QUAL, CYCLE, EVENT
* RG, QUAL, CONTEXT, EVENT
*
* Note: If there are no optional covariates, only one bitset key will be returned with all the required covariates and the event type
*
* @param allKeys The keys in bitset representation for each covariate
* @param eventType The type of event described by this keyset (e.g. mismatches, insertions, deletions)
* @return one key in bitset representation per covariate
*/ */
public List<BitSet> bitSetsFromAllKeys(BitSet[] allKeys, EventType eventType) { public long createMasterKey(final long[] allKeys, final EventType eventType, final int optionalCovariateIndex) {
List<BitSet> allBitSets = new ArrayList<BitSet>(); // Generate one key per optional covariate
BitSet eventBitSet = bitSetFromEvent(eventType); // create a bitset with the event type int keyIndex = 0;
int eventTypeBitIndex = nRequiredBits + nOptionalBits + nOptionalIDBits; // Location in the bit set to add the event type bits long masterKey = 0L; // This will be a master key holding all the required keys, to replicate later on
int covariateIndex = 0;
BitSet requiredKey = new BitSet(nRequiredBits); // This will be a bitset holding all the required keys, to replicate later on
for (RequiredCovariateInfo infoRequired : requiredCovariatesInfo) for (RequiredCovariateInfo infoRequired : requiredCovariatesInfo)
addBitSetToKeyAtLocation(requiredKey, allKeys[covariateIndex++], infoRequired.bitsBefore); // Add all the required covariates to the key set masterKey |= (allKeys[keyIndex++] << infoRequired.offset);
for (OptionalCovariateInfo infoOptional : optionalCovariatesInfo) { final long eventKey = keyFromEvent(eventType); // create a key for the event type
BitSet covariateKey = allKeys[covariateIndex++]; // get the bitset from all keys masterKey |= (eventKey << nRequiredBits);
if (covariateKey == null)
continue; // do not add nulls to the final set of keys.
BitSet optionalKey = new BitSet(totalNumberOfBits); // create a new key for this optional covariate if (optionalCovariateIndex >= 0 && optionalCovariateIndex < optionalCovariates.length) {
optionalKey.or(requiredKey); // import all the required covariates final long covariateKey = allKeys[keyIndex + optionalCovariateIndex];
addBitSetToKeyAtLocation(optionalKey, covariateKey, nRequiredBits); // add the optional covariate right after the required covariates if (covariateKey < 0) // do not add "nulls" to the final set of keys
addBitSetToKeyAtLocation(optionalKey, infoOptional.covariateID, nRequiredBits + nOptionalBits); // add the optional covariate ID right after the optional covarite return -1;
addBitSetToKeyAtLocation(optionalKey, eventBitSet, eventTypeBitIndex); // Add the event type
allBitSets.add(optionalKey); // add this key to the list of keys masterKey |= (covariateKey << optionalCovariateOffset);
masterKey |= (optionalCovariatesInfo[optionalCovariateIndex].covariateID << optionalCovariateIDOffset);
} }
if (optionalCovariatesInfo.size() == 0) { // special case when we have no optional covariates, add the event type to the required key (our only key) return masterKey;
addBitSetToKeyAtLocation(requiredKey, eventBitSet, eventTypeBitIndex); // Add the event type
allBitSets.add(requiredKey); // add this key to the list of keys
}
return allBitSets;
} }
/** /**
* Generates one bitset key for the covariates represented in Object[] key * Generates one key for the covariates represented in Object[] key
* *
* The covariates will have the actual objects produced by the covariates (probably read from the recalibration data file) * The covariates will have the actual objects produced by the covariates (probably read from the recalibration data file)
* and will contain all required covariates and one (or none) optional covariates. Therefore, the product is one bitset key, not many. * and will contain all required covariates and one (or none) optional covariates. Therefore, the product is one key, not many.
* *
* Example key: * Example key:
* RG, QUAL, CYCLE, CYCLE_ID, EventType * RG, QUAL, CYCLE, CYCLE_ID, EventType
* *
* @param key list of objects produced by the required covariates followed by one or zero optional covariates. * @param key list of objects produced by the required covariates followed by one or zero optional covariates.
* @return a bitset key representing these objects. Bitset encryption is done using the covariate's interface. * @return a key representing these objects.
*/ */
public BitSet bitSetFromKey(Object[] key) { public long longFromKey(Object[] key) {
BitSet bitSetKey = new BitSet(totalNumberOfBits);
int requiredCovariate = 0; int requiredCovariate = 0;
for (RequiredCovariateInfo infoRequired : requiredCovariatesInfo) { long masterKey = 0L; // This will be a master key holding all the required keys, to replicate later on
BitSet covariateBitSet = infoRequired.covariate.bitSetFromKey(key[requiredCovariate++]); // create a bitset from the object key provided using the required covariate's interface for (RequiredCovariateInfo infoRequired : requiredCovariatesInfo)
addBitSetToKeyAtLocation(bitSetKey, covariateBitSet, infoRequired.bitsBefore); // add it to the bitset key masterKey |= (infoRequired.covariate.longFromKey(key[requiredCovariate++]) << infoRequired.offset);
}
if (optionalCovariatesInfo.size() > 0) {
int optionalCovariate = requiredCovariatesInfo.size(); // the optional covariate index in the key array
int covariateIDIndex = optionalCovariate + 1; // the optional covariate ID index is right after the optional covariate's
int covariateID = parseCovariateID(key[covariateIDIndex]); // when reading the GATK Report the ID may come in a String instead of an index
OptionalCovariateInfo infoOptional = optionalCovariatesInfo.get(covariateID); // so we can get the optional covariate information
BitSet covariateBitSet = infoOptional.covariate.bitSetFromKey(key[optionalCovariate]); // convert the optional covariate key into a bitset using the covariate's interface
addBitSetToKeyAtLocation(bitSetKey, covariateBitSet, nRequiredBits); // add the optional covariate right after the required covariates
addBitSetToKeyAtLocation(bitSetKey, infoOptional.covariateID, nRequiredBits + nOptionalBits); // add the optional covariate ID right after the optional covarite
}
int eventIndex = key.length - 1; // the event type is always the last key
int eventTypeBitIndex = nRequiredBits + nOptionalBits + nOptionalIDBits; // location in the bit set to add the event type bits
BitSet eventBitSet = bitSetFromEvent((EventType) key[eventIndex]); // get the bit set representation of the event type
addBitSetToKeyAtLocation(bitSetKey, eventBitSet, eventTypeBitIndex); // add the event type
return bitSetKey; final int eventIndex = key.length - 1; // the event type is always the last key
final long eventKey = keyFromEvent((EventType) key[eventIndex]); // create a key for the event type
masterKey |= (eventKey << nRequiredBits);
if (optionalCovariatesInfo.length > 0) {
final int covariateIndex = requiredCovariatesInfo.length; // the optional covariate index in the key array
final int covariateIDIndex = covariateIndex + 1; // the optional covariate ID index is right after the optional covariate's
final short covariateID = parseCovariateID(key[covariateIDIndex]); // when reading the GATK Report the ID may come in a String instead of an index
final OptionalCovariateInfo infoOptional = optionalCovariatesInfo[covariateID]; // so we can get the optional covariate information
final long covariateKey = infoOptional.covariate.longFromKey(key[covariateIndex]); // convert the optional covariate key into a bitset using the covariate's interface
masterKey |= (covariateKey << optionalCovariateOffset);
masterKey |= (infoOptional.covariateID << optionalCovariateIDOffset);
}
return masterKey;
} }
/** /**
@ -176,116 +165,82 @@ public class BQSRKeyManager {
* @param id the string or short representation of the optional covariate id * @param id the string or short representation of the optional covariate id
* @return the short representation of the optional covariate id. * @return the short representation of the optional covariate id.
*/ */
private short parseCovariateID(Object id) { private short parseCovariateID(final Object id) {
return (id instanceof String) ? covariateNameToIDMap.get(id.toString()) : (Short) id; return (id instanceof String) ? covariateNameToIDMap.get(id.toString()) : (Short) id;
} }
/** /**
* Generates a key set of objects from a combined bitset key. * Generates a key set of objects from a combined master key.
* *
* Masks out each covariate independently and decodes their values (Object) into a keyset * Masks out each covariate independently and decodes their values (Object) into a keyset
* *
* @param key the bitset representation of the keys * @param master the master representation of the keys
* @return an object array with the values for each key * @return an object array with the values for each key
*/ */
public List<Object> keySetFrom(BitSet key) { public List<Object> keySetFrom(final long master) {
List<Object> objectKeys = new ArrayList<Object>(); final List<Object> objectKeys = new ArrayList<Object>();
for (RequiredCovariateInfo info : requiredCovariatesInfo) { for (RequiredCovariateInfo info : requiredCovariatesInfo) {
BitSet covariateBitSet = extractBitSetFromKey(key, info.mask, info.bitsBefore); // get the covariate's bitset final long covariateKey = extractKeyFromMaster(master, info.mask, info.offset); // get the covariate's key
objectKeys.add(info.covariate.keyFromBitSet(covariateBitSet)); // convert the bitset to object using covariate's interface objectKeys.add(info.covariate.formatKey(covariateKey)); // convert the key to object using covariate's interface
} }
if (optionalCovariatesInfo.size() > 0) { if (optionalCovariatesInfo.length > 0) {
BitSet covBitSet = extractBitSetFromKey(key, optionalCovariateMask, nRequiredBits); // mask out the covariate bit set final long covKey = extractKeyFromMaster(master, optionalCovariateMask, optionalCovariateOffset); // get the covariate's key
BitSet idbs = extractBitSetFromKey(key, optionalCovariateIDMask, nRequiredBits + nOptionalBits); // mask out the covariate order (to identify which covariate this is) final int covIDKey = (int)extractKeyFromMaster(master, optionalCovariateIDMask, optionalCovariateIDOffset); // get the covariate's id (to identify which covariate this is)
short id = BitSetUtils.shortFrom(idbs); // covert the id bitset into a short Covariate covariate = optionalCovariatesInfo[(short)covIDKey].covariate; // get the corresponding optional covariate object
Covariate covariate = optionalCovariatesInfo.get(id).covariate; // get the corresponding optional covariate object objectKeys.add(covariate.formatKey(covKey)); // add the optional covariate key to the key set
objectKeys.add(covariate.keyFromBitSet(covBitSet)); // add the optional covariate to the key set
objectKeys.add(covariate.getClass().getSimpleName().split("Covariate")[0]); // add the covariate name using the id objectKeys.add(covariate.getClass().getSimpleName().split("Covariate")[0]); // add the covariate name using the id
} }
objectKeys.add(eventFromBitSet(key)); // add the event type object to the key set
objectKeys.add(EventType.eventFrom((int)extractKeyFromMaster(master, eventIDMask, nRequiredBits))); // add the event type object to the key set
return objectKeys; return objectKeys;
} }
public List<Covariate> getRequiredCovariates() { public Covariate[] getRequiredCovariates() {
return requiredCovariates; return requiredCovariates;
} }
public List<Covariate> getOptionalCovariates() { public Covariate[] getOptionalCovariates() {
return optionalCovariates; return optionalCovariates;
} }
/** public int getNumRequiredCovariates() {
* Translates a masked bitset into a bitset starting at 0 return requiredCovariates.length;
* }
* @param key the masked out bitset
* @param n the number of bits to chop public int getNumOptionalCovariates() {
* @return a translated bitset starting at 0 for the covariate machinery to decode return optionalCovariates.length;
*/
private BitSet chopNBitsFrom(BitSet key, int n) {
BitSet choppedKey = new BitSet();
for (int i = key.nextSetBit(0); i >= 0; i = key.nextSetBit(i + 1))
choppedKey.set(i - n); // Set every bit translocated to the beginning of the BitSet
return choppedKey;
} }
/** /**
* Creates a mask for the requested covariate to extract the relevant bitset from a combined bitset key * Creates a mask for the requested covariate to extract the relevant key from a combined master key
* *
* @param leadingBits the index of the covariate in the ordered covariate list * @param offset the offset into the master key
* @param nBits the number of bits needed by the Covariate to represent its values in BitSet form * @param nBits the number of bits needed by the Covariate to represent its values
* @return the bitset relevant to the covariate * @return the mask relevant to the covariate
*/ */
private long genericMask(final int offset, final int nBits) {
private BitSet genericMask(int leadingBits, int nBits) { long mask = 0L;
BitSet mask = new BitSet(leadingBits + nBits); for ( int i = 0; i < nBits; i++ )
mask.set(leadingBits, leadingBits + nBits); mask |= 1L << (offset+i);
return mask; return mask;
} }
/** private long extractKeyFromMaster(final long master, final long mask, final int offset) {
* Decodes the event type (enum) from the full bitset key long key = master & mask;
* return key >> offset;
* @param fullKey the full key of all covariates + event type
* @return the decoded event type.
*/
private EventType eventFromBitSet(BitSet fullKey) {
BitSet eventKey = new BitSet();
int firstBitIndex = nRequiredBits + nOptionalBits + nOptionalIDBits;
for (int i = fullKey.nextSetBit(firstBitIndex); i >= 0; i = fullKey.nextSetBit(i + 1))
eventKey.set(i - firstBitIndex);
return EventType.eventFrom(BitSetUtils.shortFrom(eventKey));
} }
// cache the BitSet representing an event since it's otherwise created a massive amount of times // cache the key representing an event since it's otherwise created a massive amount of times
private static final Map<EventType, BitSet> eventTypeCache = new HashMap<EventType, BitSet>(EventType.values().length); private static final long[] eventTypeCache = new long[EventType.values().length]; // event IDs must be longs so that bit-fiddling works
static { static {
for (final EventType eventType : EventType.values()) for (final EventType eventType : EventType.values())
eventTypeCache.put(eventType, BitSetUtils.bitSetFrom(eventType.index)); eventTypeCache[eventType.index] = (long)eventType.index;
} }
private BitSet bitSetFromEvent(final EventType eventType) { private long keyFromEvent(final EventType eventType) {
return eventTypeCache.get(eventType); return eventTypeCache[eventType.index];
}
private BitSet bitSetFromId(final short id) {
return BitSetUtils.bitSetFrom(id);
}
private int bitsInEventType() {
return BitSetUtils.numberOfBitsToRepresent(EventType.values().length);
}
private void addBitSetToKeyAtLocation(BitSet key, BitSet bitSet, int location) {
for (int j = bitSet.nextSetBit(0); j >= 0; j = bitSet.nextSetBit(j + 1))
key.set(j + location); // translate the bits set in the key to their corresponding position in the full key
}
private BitSet extractBitSetFromKey (BitSet key, BitSet mask, int leadingBits) {
BitSet bitSet = (BitSet) key.clone();
bitSet.and(mask);
return chopNBitsFrom(bitSet, leadingBits);
} }
@Override @Override
@ -297,22 +252,22 @@ public class BQSRKeyManager {
if (this == other) if (this == other)
return true; return true;
if (requiredCovariatesInfo.size() != other.requiredCovariatesInfo.size() || if (requiredCovariatesInfo.length != other.requiredCovariatesInfo.length ||
optionalCovariatesInfo.size() != other.optionalCovariatesInfo.size()) optionalCovariatesInfo.length != other.optionalCovariatesInfo.length)
return false; return false;
for (int i = 0; i < requiredCovariates.size(); i++) { for (int i = 0; i < requiredCovariates.length; i++) {
Covariate myRequiredCovariate = requiredCovariates.get(i); Covariate myRequiredCovariate = requiredCovariates[i];
Covariate otherRequiredCovariate = other.requiredCovariates.get(i); Covariate otherRequiredCovariate = other.requiredCovariates[i];
String thisName = myRequiredCovariate.getClass().getSimpleName(); String thisName = myRequiredCovariate.getClass().getSimpleName();
String otherName = otherRequiredCovariate.getClass().getSimpleName(); String otherName = otherRequiredCovariate.getClass().getSimpleName();
if (!thisName.equals(otherName)) if (!thisName.equals(otherName))
return false; return false;
} }
for (int i = 0; i < optionalCovariates.size(); i++) { for (int i = 0; i < optionalCovariates.length; i++) {
Covariate myOptionalCovariate = optionalCovariates.get(i); Covariate myOptionalCovariate = optionalCovariates[i];
Covariate otherOptionalCovariate = other.optionalCovariates.get(i); Covariate otherOptionalCovariate = other.optionalCovariates[i];
String thisName = myOptionalCovariate.getClass().getSimpleName(); String thisName = myOptionalCovariate.getClass().getSimpleName();
String otherName = otherOptionalCovariate.getClass().getSimpleName(); String otherName = otherOptionalCovariate.getClass().getSimpleName();
if (!thisName.equals(otherName)) if (!thisName.equals(otherName))
@ -322,27 +277,50 @@ public class BQSRKeyManager {
return true; return true;
} }
/**
* Calculates the number of bits necessary to represent a given number of elements
*
* @param numberOfElements the number of elements to represent (must be positive)
* @return the number of bits necessary to represent this many elements
*/
public static int numberOfBitsToRepresent(long numberOfElements) {
if (numberOfElements < 0)
throw new ReviewedStingException("Number of elements must be positive: " + numberOfElements);
if (numberOfElements == 1L)
return 1; // special case
int n = 0;
numberOfElements--;
while (numberOfElements > 0) {
numberOfElements = numberOfElements >> 1;
n++;
}
return n;
}
/** /**
* Aggregate information for each Covariate * Aggregate information for each Covariate
*/ */
class RequiredCovariateInfo { private static class RequiredCovariateInfo {
public final int bitsBefore; // number of bits before this covariate in the combined bitset key public final int nBits; // number of bits for this key
public final BitSet mask; // the mask to pull out this covariate from the combined bitset key ( a mask made from bitsBefore and nBits ) public final int offset; // the offset into the master key
public final long mask; // the mask to pull out this covariate from the combined bitset key ( a mask made from bitsBefore and nBits )
public final Covariate covariate; // this allows reverse lookup of the Covariates in order public final Covariate covariate; // this allows reverse lookup of the Covariates in order
RequiredCovariateInfo(int bitsBefore, BitSet mask, Covariate covariate) { RequiredCovariateInfo(final int nBits, final int offset, final long mask, final Covariate covariate) {
this.bitsBefore = bitsBefore; this.nBits = nBits;
this.offset = offset;
this.mask = mask; this.mask = mask;
this.covariate = covariate; this.covariate = covariate;
} }
} }
class OptionalCovariateInfo { private static class OptionalCovariateInfo {
public final BitSet covariateID; // cache the covariate ID public final long covariateID; // cache the covariate ID (must be a long so that bit-fiddling works)
public final Covariate covariate; public final Covariate covariate;
OptionalCovariateInfo(BitSet covariateID, Covariate covariate) { OptionalCovariateInfo(final long covariateID, final Covariate covariate) {
this.covariateID = covariateID; this.covariateID = covariateID;
this.covariate = covariate; this.covariate = covariate;
} }

View File

@ -26,15 +26,12 @@
package org.broadinstitute.sting.gatk.walkers.bqsr; package org.broadinstitute.sting.gatk.walkers.bqsr;
import org.broadinstitute.sting.utils.BaseUtils; import org.broadinstitute.sting.utils.BaseUtils;
import org.broadinstitute.sting.utils.BitSetUtils;
import org.broadinstitute.sting.utils.clipping.ClippingRepresentation; import org.broadinstitute.sting.utils.clipping.ClippingRepresentation;
import org.broadinstitute.sting.utils.clipping.ReadClipper; import org.broadinstitute.sting.utils.clipping.ReadClipper;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import java.util.Arrays;
import java.util.BitSet;
/** /**
* Created by IntelliJ IDEA. * Created by IntelliJ IDEA.
* User: rpoplin * User: rpoplin
@ -44,8 +41,7 @@ import java.util.BitSet;
public class ContextCovariate implements StandardCovariate { public class ContextCovariate implements StandardCovariate {
private int mismatchesContextSize; private int mismatchesContextSize;
private int insertionsContextSize; private int indelsContextSize;
private int deletionsContextSize;
private byte LOW_QUAL_TAIL; private byte LOW_QUAL_TAIL;
@ -53,42 +49,33 @@ public class ContextCovariate implements StandardCovariate {
@Override @Override
public void initialize(final RecalibrationArgumentCollection RAC) { public void initialize(final RecalibrationArgumentCollection RAC) {
mismatchesContextSize = RAC.MISMATCHES_CONTEXT_SIZE; mismatchesContextSize = RAC.MISMATCHES_CONTEXT_SIZE;
insertionsContextSize = RAC.INSERTIONS_CONTEXT_SIZE; indelsContextSize = RAC.INDELS_CONTEXT_SIZE;
deletionsContextSize = RAC.DELETIONS_CONTEXT_SIZE; if (mismatchesContextSize > MAX_DNA_CONTEXT)
throw new UserException.BadArgumentValue("mismatches_context_size", String.format("context size cannot be bigger than %d, but was %d", MAX_DNA_CONTEXT, mismatchesContextSize));
if (indelsContextSize > MAX_DNA_CONTEXT)
throw new UserException.BadArgumentValue("indels_context_size", String.format("context size cannot be bigger than %d, but was %d", MAX_DNA_CONTEXT, indelsContextSize));
LOW_QUAL_TAIL = RAC.LOW_QUAL_TAIL; LOW_QUAL_TAIL = RAC.LOW_QUAL_TAIL;
if (mismatchesContextSize <= 0 || insertionsContextSize <= 0 || deletionsContextSize <= 0) if (mismatchesContextSize <= 0 || indelsContextSize <= 0)
throw new UserException(String.format("Context Size must be positive, if you don't want to use the context covariate, just turn it off instead. Mismatches: %d Insertions: %d Deletions:%d", mismatchesContextSize, insertionsContextSize, deletionsContextSize)); throw new UserException(String.format("Context size must be positive, if you don't want to use the context covariate, just turn it off instead. Mismatches: %d Indels: %d", mismatchesContextSize, indelsContextSize));
} }
@Override @Override
public CovariateValues getValues(final GATKSAMRecord read) { public void recordValues(final GATKSAMRecord read, final ReadCovariates values) {
int l = read.getReadLength();
BitSet[] mismatches = new BitSet[l];
BitSet[] insertions = new BitSet[l];
BitSet[] deletions = new BitSet[l];
GATKSAMRecord clippedRead = ReadClipper.clipLowQualEnds(read, LOW_QUAL_TAIL, ClippingRepresentation.WRITE_NS); // Write N's over the low quality tail of the reads to avoid adding them into the context final GATKSAMRecord clippedRead = ReadClipper.clipLowQualEnds(read, LOW_QUAL_TAIL, ClippingRepresentation.WRITE_NS); // Write N's over the low quality tail of the reads to avoid adding them into the context
final boolean negativeStrand = clippedRead.getReadNegativeStrandFlag(); final boolean negativeStrand = clippedRead.getReadNegativeStrandFlag();
byte[] bases = clippedRead.getReadBases(); byte[] bases = clippedRead.getReadBases();
if (negativeStrand) if (negativeStrand)
bases = BaseUtils.simpleReverseComplement(bases); bases = BaseUtils.simpleReverseComplement(bases);
for (int i = 0; i < clippedRead.getReadLength(); i++) { final int readLength = clippedRead.getReadLength();
mismatches[i] = contextWith(bases, i, mismatchesContextSize); for (int i = 0; i < readLength; i++) {
insertions[i] = contextWith(bases, i, insertionsContextSize); final long indelKey = contextWith(bases, i, indelsContextSize);
deletions[i] = contextWith(bases, i, deletionsContextSize); values.addCovariate(contextWith(bases, i, mismatchesContextSize), indelKey, indelKey, (negativeStrand ? readLength - i - 1 : i));
} }
if (negativeStrand) {
reverse(mismatches);
reverse(insertions);
reverse(deletions);
}
return new CovariateValues(mismatches, insertions, deletions);
} }
// Used to get the covariate's value from input csv file during on-the-fly recalibration // Used to get the covariate's value from input csv file during on-the-fly recalibration
@ -98,21 +85,21 @@ public class ContextCovariate implements StandardCovariate {
} }
@Override @Override
public String keyFromBitSet(BitSet key) { public String formatKey(final long key) {
if (key == null) // this can only happen in test routines because we do not propagate null keys to the csv file if (key == -1) // this can only happen in test routines because we do not propagate null keys to the csv file
return null; return null;
return BitSetUtils.dnaFrom(key); return contextFromKey(key);
} }
@Override @Override
public BitSet bitSetFromKey(Object key) { public long longFromKey(Object key) {
return BitSetUtils.bitSetFrom((String) key); return keyFromContext((String) key);
} }
@Override @Override
public int numberOfBits() { public int numberOfBits() {
return Long.bitCount(-1L); return Integer.bitCount(Integer.MAX_VALUE);
} }
/** /**
@ -121,29 +108,132 @@ public class ContextCovariate implements StandardCovariate {
* @param bases the bases in the read to build the context from * @param bases the bases in the read to build the context from
* @param offset the position in the read to calculate the context for * @param offset the position in the read to calculate the context for
* @param contextSize context size to use building the context * @param contextSize context size to use building the context
* @return the bitSet representing the Context * @return the key representing the context
*/ */
private BitSet contextWith(byte[] bases, int offset, int contextSize) { private long contextWith(final byte[] bases, final int offset, final int contextSize) {
BitSet result = null; final int start = offset - contextSize + 1;
if (offset - contextSize + 1 >= 0) { final long result;
final byte[] context = Arrays.copyOfRange(bases, offset - contextSize + 1, offset + 1); if (start >= 0)
if (!BaseUtils.containsBase(context, BaseUtils.N)) result = keyFromContext(bases, start, offset + 1);
result = BitSetUtils.bitSetFrom(context); else
} result = -1L;
return result; return result;
} }
public static long keyFromContext(final String dna) {
return keyFromContext(dna.getBytes(), 0, dna.length());
}
/** /**
* Reverses the given array in place. * Creates a long representation of a given dna string.
* *
* @param array any array * Warning: This conversion is limited to long precision, therefore the dna sequence cannot
* be longer than 31 bases.
*
* The bit representation of a dna string is the simple:
* 0 A 4 AA 8 CA
* 1 C 5 AC ...
* 2 G 6 AG 1343 TTGGT
* 3 T 7 AT 1364 TTTTT
*
* To convert from dna to number, we convert the dna string to base10 and add all combinations that
* preceded the string (with smaller lengths).
*
* @param dna the dna sequence
* @return the key representing the dna sequence
*/ */
private static void reverse(final Object[] array) { public static long keyFromContext(final byte[] dna, final int start, final int end) {
final int arrayLength = array.length; final long preContext = combinationsPerLength[end - start - 1]; // the sum of all combinations that preceded the length of the dna string
for (int l = 0, r = arrayLength - 1; l < r; l++, r--) { long baseTen = 0L; // the number in base_10 that we are going to use to generate the bit set
final Object temp = array[l]; for (int i = start; i < end; i++) {
array[l] = array[r]; baseTen = (baseTen << 2); // multiply by 4
array[r] = temp; final int baseIndex = BaseUtils.simpleBaseToBaseIndex(dna[i]);
if (baseIndex == -1) // ignore non-ACGT bases
return -1L;
baseTen += (long)baseIndex;
} }
return baseTen + preContext; // the number representing this DNA string is the base_10 representation plus all combinations that preceded this string length.
}
static final private int MAX_DNA_CONTEXT = 31; // the maximum context size (number of bases) permitted in the "long bitset" implementation of the DNA <=> BitSet conversion.
static final long[] combinationsPerLength = new long[MAX_DNA_CONTEXT + 1]; // keeps the memoized table with the number of combinations for each given DNA context length
static {
for (int i = 0; i < MAX_DNA_CONTEXT + 1; i++)
computeCombinationsFor(i);
}
/**
* The sum of all combinations of a context of a given length from length = 0 to length.
*
* Memoized implementation of sum(4^i) , where i=[0,length]
*
* @param length the length of the DNA context
*/
private static void computeCombinationsFor(final int length) {
long combinations = 0L;
for (int i = 1; i <= length; i++)
combinations += (1L << 2 * i); // add all combinations with 4^i ( 4^i is the same as 2^(2*i) )
combinationsPerLength[length] = combinations;
}
/**
* Converts a key into the dna string representation.
*
* Warning: This conversion is limited to long precision, therefore the dna sequence cannot
* be longer than 31 bases.
*
* We calculate the length of the resulting DNA sequence by looking at the sum(4^i) that exceeds the
* base_10 representation of the sequence. This is important for us to know how to bring the number
* to a quasi-canonical base_4 representation, and to fill in leading A's (since A's are represented
* as 0's and leading 0's are omitted).
*
* quasi-canonical because A is represented by a 0, therefore,
* instead of : 0, 1, 2, 3, 10, 11, 12, ...
* we have : 0, 1, 2, 3, 00, 01, 02, ...
*
* but we can correctly decode it because we know the final length.
*
* @param key the key representing the dna sequence
* @return the dna sequence represented by the key
*/
public static String contextFromKey(long key) {
if (key < 0)
throw new ReviewedStingException("dna conversion cannot handle negative numbers. Possible overflow?");
final int length = contextLengthFor(key); // the length of the context (the number of combinations is memoized, so costs zero to separate this into two method calls)
key -= combinationsPerLength[length - 1]; // subtract the the number of combinations of the preceding context from the number to get to the quasi-canonical representation
StringBuilder dna = new StringBuilder();
while (key > 0) { // perform a simple base_10 to base_4 conversion (quasi-canonical)
final byte base = (byte) (key & 3); // equivalent to (key % 4)
dna.append((char)BaseUtils.baseIndexToSimpleBase(base));
key = key >> 2; // divide by 4
}
for (int j = dna.length(); j < length; j++)
dna.append('A'); // add leading A's as necessary (due to the "quasi" canonical status, see description above)
return dna.reverse().toString(); // make sure to reverse the string since we should have been pre-pending all along
}
/**
* Calculates the length of the DNA context for a given base 10 number
*
* It is important to know the length given the base 10 number to calculate the number of combinations
* and to disambiguate the "quasi-canonical" state.
*
* This method also calculates the number of combinations as a by-product, but since it memoizes the
* results, a subsequent call to combinationsFor(length) is O(1).
*
* @param number the base 10 representation of the key
* @return the length of the DNA context represented by this number
*/
private static int contextLengthFor(final long number) {
int length = 1; // the calculated length of the DNA sequence given the base_10 representation of its BitSet.
long combinations = combinationsPerLength[length]; // the next context (we advance it so we know which one was preceding it).
while (combinations <= number) { // find the length of the dna string (length)
length++;
combinations = combinationsPerLength[length]; // calculate the next context
}
return length;
} }
} }

View File

@ -2,8 +2,6 @@ package org.broadinstitute.sting.gatk.walkers.bqsr;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import java.util.BitSet;
/* /*
* Copyright (c) 2009 The Broad Institute * Copyright (c) 2009 The Broad Institute
* *
@ -45,15 +43,15 @@ public interface Covariate {
* *
* @param RAC the recalibration argument collection * @param RAC the recalibration argument collection
*/ */
public void initialize(RecalibrationArgumentCollection RAC); public void initialize(final RecalibrationArgumentCollection RAC);
/** /**
* Calculates covariate values for all positions in the read. * Calculates covariate values for all positions in the read.
* *
* @param read the read to calculate the covariates on. * @param read the read to calculate the covariates on.
* @return all the covariate values for every base in the read. * @param values the object to record the covariate values for every base in the read.
*/ */
public CovariateValues getValues(GATKSAMRecord read); public void recordValues(final GATKSAMRecord read, final ReadCovariates values);
/** /**
* Used to get the covariate's value from input csv file during on-the-fly recalibration * Used to get the covariate's value from input csv file during on-the-fly recalibration
@ -61,26 +59,26 @@ public interface Covariate {
* @param str the key in string type (read from the csv) * @param str the key in string type (read from the csv)
* @return the key in it's correct type. * @return the key in it's correct type.
*/ */
public Object getValue(String str); public Object getValue(final String str);
/** /**
* Converts the bitset representation of the key (used internally for table indexing) to String format for file output. * Converts the internal representation of the key to String format for file output.
* *
* @param key the bitset representation of the key * @param key the long representation of the key
* @return a string representation of the key * @return a string representation of the key
*/ */
public String keyFromBitSet(BitSet key); public String formatKey(final long key);
/** /**
* Converts a key into a bitset * Converts an Object key into a long key using only the lowest numberOfBits() bits
* *
* Only necessary for on-the-fly recalibration when you have the object, but need to store it in memory in bitset format. For counting covariates * Only necessary for on-the-fly recalibration when you have the object, but need to store it in memory in long format. For counting covariates
* the getValues method already returns all values in BitSet format. * the getValues method already returns all values in long format.
* *
* @param key the object corresponding to the covariate * @param key the object corresponding to the covariate
* @return a bitset representation of the object * @return a long representation of the object
*/ */
public BitSet bitSetFromKey(Object key); public long longFromKey(final Object key);
/** /**
* Each covariate should determine how many bits are necessary to encode it's data * Each covariate should determine how many bits are necessary to encode it's data

View File

@ -1,39 +0,0 @@
package org.broadinstitute.sting.gatk.walkers.bqsr;
import java.util.BitSet;
/**
* An object to hold the different covariate values for all bases in the read.
*
* Currently we have three different covariates for each read:
* - Mismatch
* - Insertion
* - Deletion
*
* @author Mauricio Carneiro
* @since 2/8/12
*/
public class CovariateValues {
private final BitSet[] mismatches;
private final BitSet[] insertions;
private final BitSet[] deletions;
public CovariateValues(BitSet[] mismatch, BitSet[] insertion, BitSet[] deletion) {
this.mismatches = mismatch;
this.insertions = insertion;
this.deletions = deletion;
}
public BitSet[] getMismatches() {
return mismatches;
}
public BitSet[] getInsertions() {
return insertions;
}
public BitSet[] getDeletions() {
return deletions;
}
}

View File

@ -1,12 +1,10 @@
package org.broadinstitute.sting.gatk.walkers.bqsr; package org.broadinstitute.sting.gatk.walkers.bqsr;
import org.broadinstitute.sting.utils.BaseUtils; import org.broadinstitute.sting.utils.BaseUtils;
import org.broadinstitute.sting.utils.BitSetUtils;
import org.broadinstitute.sting.utils.NGSPlatform; import org.broadinstitute.sting.utils.NGSPlatform;
import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import java.util.BitSet;
import java.util.EnumSet; import java.util.EnumSet;
/* /*
@ -60,18 +58,18 @@ public class CycleCovariate implements StandardCovariate {
// Used to pick out the covariate's value from attributes of the read // Used to pick out the covariate's value from attributes of the read
@Override @Override
public CovariateValues getValues(final GATKSAMRecord read) { public void recordValues(final GATKSAMRecord read, final ReadCovariates values) {
BitSet[] cycles = new BitSet[read.getReadLength()]; final int readLength = read.getReadLength();
final NGSPlatform ngsPlatform = read.getNGSPlatform(); final NGSPlatform ngsPlatform = read.getNGSPlatform();
// Discrete cycle platforms // Discrete cycle platforms
if (DISCRETE_CYCLE_PLATFORMS.contains(ngsPlatform)) { if (DISCRETE_CYCLE_PLATFORMS.contains(ngsPlatform)) {
final short readOrderFactor = read.getReadPairedFlag() && read.getSecondOfPairFlag() ? (short) -1 : 1; final int readOrderFactor = read.getReadPairedFlag() && read.getSecondOfPairFlag() ? -1 : 1;
final short increment; final int increment;
short cycle; int cycle;
if (read.getReadNegativeStrandFlag()) { if (read.getReadNegativeStrandFlag()) {
cycle = (short) (read.getReadLength() * readOrderFactor); cycle = readLength * readOrderFactor;
increment = (short) (-1 * readOrderFactor); increment = -1 * readOrderFactor;
} }
else { else {
cycle = readOrderFactor; cycle = readOrderFactor;
@ -79,9 +77,10 @@ public class CycleCovariate implements StandardCovariate {
} }
final int CUSHION = 4; final int CUSHION = 4;
final int MAX_CYCLE = read.getReadLength() - CUSHION - 1; final int MAX_CYCLE = readLength - CUSHION - 1;
for (int i = 0; i < MAX_CYCLE; i++) { for (int i = 0; i < readLength; i++) {
cycles[i] = (i<CUSHION || i>MAX_CYCLE) ? null : BitSetUtils.bitSetFrom(cycle); final long key = (i<CUSHION || i>MAX_CYCLE) ? -1L : keyFromCycle(cycle);
values.addCovariate(key, key, key, i);
cycle += increment; cycle += increment;
} }
} }
@ -89,7 +88,6 @@ public class CycleCovariate implements StandardCovariate {
// Flow cycle platforms // Flow cycle platforms
else if (FLOW_CYCLE_PLATFORMS.contains(ngsPlatform)) { else if (FLOW_CYCLE_PLATFORMS.contains(ngsPlatform)) {
final int readLength = read.getReadLength();
final byte[] bases = read.getReadBases(); final byte[] bases = read.getReadBases();
// Differentiate between first and second of pair. // Differentiate between first and second of pair.
@ -100,7 +98,7 @@ public class CycleCovariate implements StandardCovariate {
// the current sequential model would consider the effects independently instead of jointly. // the current sequential model would consider the effects independently instead of jointly.
final boolean multiplyByNegative1 = read.getReadPairedFlag() && read.getSecondOfPairFlag(); final boolean multiplyByNegative1 = read.getReadPairedFlag() && read.getSecondOfPairFlag();
short cycle = multiplyByNegative1 ? (short) -1 : 1; // todo -- check if this is the right behavior for mate paired reads in flow cycle platforms. int cycle = multiplyByNegative1 ? -1 : 1; // todo -- check if this is the right behavior for mate paired reads in flow cycle platforms.
// BUGBUG: Consider looking at degradation of base quality scores in homopolymer runs to detect when the cycle incremented even though the nucleotide didn't change // BUGBUG: Consider looking at degradation of base quality scores in homopolymer runs to detect when the cycle incremented even though the nucleotide didn't change
// For example, AAAAAAA was probably read in two flow cycles but here we count it as one // For example, AAAAAAA was probably read in two flow cycles but here we count it as one
@ -108,19 +106,23 @@ public class CycleCovariate implements StandardCovariate {
int iii = 0; int iii = 0;
while (iii < readLength) { while (iii < readLength) {
while (iii < readLength && bases[iii] == (byte) 'T') { while (iii < readLength && bases[iii] == (byte) 'T') {
cycles[iii] = BitSetUtils.bitSetFrom(cycle); final long key = keyFromCycle(cycle);
values.addCovariate(key, key, key, iii);
iii++; iii++;
} }
while (iii < readLength && bases[iii] == (byte) 'A') { while (iii < readLength && bases[iii] == (byte) 'A') {
cycles[iii] = BitSetUtils.bitSetFrom(cycle); final long key = keyFromCycle(cycle);
values.addCovariate(key, key, key, iii);
iii++; iii++;
} }
while (iii < readLength && bases[iii] == (byte) 'C') { while (iii < readLength && bases[iii] == (byte) 'C') {
cycles[iii] = BitSetUtils.bitSetFrom(cycle); final long key = keyFromCycle(cycle);
values.addCovariate(key, key, key, iii);
iii++; iii++;
} }
while (iii < readLength && bases[iii] == (byte) 'G') { while (iii < readLength && bases[iii] == (byte) 'G') {
cycles[iii] = BitSetUtils.bitSetFrom(cycle); final long key = keyFromCycle(cycle);
values.addCovariate(key, key, key, iii);
iii++; iii++;
} }
if (iii < readLength) { if (iii < readLength) {
@ -130,7 +132,8 @@ public class CycleCovariate implements StandardCovariate {
cycle++; cycle++;
} }
if (iii < readLength && !BaseUtils.isRegularBase(bases[iii])) { if (iii < readLength && !BaseUtils.isRegularBase(bases[iii])) {
cycles[iii] = BitSetUtils.bitSetFrom(cycle); final long key = keyFromCycle(cycle);
values.addCovariate(key, key, key, iii);
iii++; iii++;
} }
@ -140,19 +143,23 @@ public class CycleCovariate implements StandardCovariate {
int iii = readLength - 1; int iii = readLength - 1;
while (iii >= 0) { while (iii >= 0) {
while (iii >= 0 && bases[iii] == (byte) 'T') { while (iii >= 0 && bases[iii] == (byte) 'T') {
cycles[iii] = BitSetUtils.bitSetFrom(cycle); final long key = keyFromCycle(cycle);
values.addCovariate(key, key, key, iii);
iii--; iii--;
} }
while (iii >= 0 && bases[iii] == (byte) 'A') { while (iii >= 0 && bases[iii] == (byte) 'A') {
cycles[iii] = BitSetUtils.bitSetFrom(cycle); final long key = keyFromCycle(cycle);
values.addCovariate(key, key, key, iii);
iii--; iii--;
} }
while (iii >= 0 && bases[iii] == (byte) 'C') { while (iii >= 0 && bases[iii] == (byte) 'C') {
cycles[iii] = BitSetUtils.bitSetFrom(cycle); final long key = keyFromCycle(cycle);
values.addCovariate(key, key, key, iii);
iii--; iii--;
} }
while (iii >= 0 && bases[iii] == (byte) 'G') { while (iii >= 0 && bases[iii] == (byte) 'G') {
cycles[iii] = BitSetUtils.bitSetFrom(cycle); final long key = keyFromCycle(cycle);
values.addCovariate(key, key, key, iii);
iii--; iii--;
} }
if (iii >= 0) { if (iii >= 0) {
@ -162,7 +169,8 @@ public class CycleCovariate implements StandardCovariate {
cycle++; cycle++;
} }
if (iii >= 0 && !BaseUtils.isRegularBase(bases[iii])) { if (iii >= 0 && !BaseUtils.isRegularBase(bases[iii])) {
cycles[iii] = BitSetUtils.bitSetFrom(cycle); final long key = keyFromCycle(cycle);
values.addCovariate(key, key, key, iii);
iii--; iii--;
} }
} }
@ -173,28 +181,38 @@ public class CycleCovariate implements StandardCovariate {
else { else {
throw new UserException("The platform (" + read.getReadGroup().getPlatform() + ") associated with read group " + read.getReadGroup() + " is not a recognized platform. Implemented options are e.g. illumina, 454, and solid"); throw new UserException("The platform (" + read.getReadGroup().getPlatform() + ") associated with read group " + read.getReadGroup() + " is not a recognized platform. Implemented options are e.g. illumina, 454, and solid");
} }
return new CovariateValues(cycles, cycles, cycles);
} }
// Used to get the covariate's value from input csv file during on-the-fly recalibration // Used to get the covariate's value from input csv file during on-the-fly recalibration
@Override @Override
public final Object getValue(final String str) { public final Object getValue(final String str) {
return Short.parseShort(str); return Integer.parseInt(str);
} }
@Override @Override
public String keyFromBitSet(BitSet key) { public String formatKey(final long key) {
return String.format("%d", BitSetUtils.shortFrom(key)); long cycle = key >> 1; // shift so we can remove the "sign" bit
if ( (key & 1) != 0 ) // is the last bit set?
cycle *= -1; // then the cycle is negative
return String.format("%d", cycle);
} }
@Override @Override
public BitSet bitSetFromKey(Object key) { public long longFromKey(final Object key) {
return (key instanceof String) ? BitSetUtils.bitSetFrom(Short.parseShort((String) key)) : BitSetUtils.bitSetFrom((Short) key); return (key instanceof String) ? keyFromCycle(Integer.parseInt((String) key)) : keyFromCycle((Integer) key);
} }
@Override @Override
public int numberOfBits() { public int numberOfBits() {
return BitSetUtils.numberOfBitsToRepresent(2 * Short.MAX_VALUE); // positive and negative return Integer.bitCount(Integer.MAX_VALUE);
}
private static long keyFromCycle(final int cycle) {
// no negative values because values must fit into the first few bits of the long
long result = Math.abs(cycle);
result = result << 1; // shift so we can add the "sign" bit
if ( cycle < 0 )
result++; // negative cycles get the lower-most bit set
return result;
} }
} }

View File

@ -1,11 +1,8 @@
package org.broadinstitute.sting.gatk.walkers.bqsr; package org.broadinstitute.sting.gatk.walkers.bqsr;
import org.broadinstitute.sting.utils.BitSetUtils;
import org.broadinstitute.sting.utils.QualityUtils; import org.broadinstitute.sting.utils.QualityUtils;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import java.util.BitSet;
/* /*
* Copyright (c) 2009 The Broad Institute * Copyright (c) 2009 The Broad Institute
* *
@ -43,28 +40,17 @@ public class QualityScoreCovariate implements RequiredCovariate {
// Initialize any member variables using the command-line arguments passed to the walkers // Initialize any member variables using the command-line arguments passed to the walkers
@Override @Override
public void initialize(final RecalibrationArgumentCollection RAC) { public void initialize(final RecalibrationArgumentCollection RAC) {}
}
@Override @Override
public CovariateValues getValues(final GATKSAMRecord read) { public void recordValues(final GATKSAMRecord read, final ReadCovariates values) {
int readLength = read.getReadLength(); final byte[] baseQualities = read.getBaseQualities();
final byte[] baseInsertionQualities = read.getBaseInsertionQualities();
BitSet[] mismatches = new BitSet[readLength]; final byte[] baseDeletionQualities = read.getBaseDeletionQualities();
BitSet[] insertions = new BitSet[readLength];
BitSet[] deletions = new BitSet[readLength];
byte[] baseQualities = read.getBaseQualities();
byte[] baseInsertionQualities = read.getBaseInsertionQualities();
byte[] baseDeletionQualities = read.getBaseDeletionQualities();
for (int i = 0; i < baseQualities.length; i++) { for (int i = 0; i < baseQualities.length; i++) {
mismatches[i] = BitSetUtils.bitSetFrom(baseQualities[i]); values.addCovariate((long)baseQualities[i], (long)baseInsertionQualities[i], (long)baseDeletionQualities[i], i);
insertions[i] = BitSetUtils.bitSetFrom(baseInsertionQualities[i]);
deletions[i] = BitSetUtils.bitSetFrom(baseDeletionQualities[i]);
} }
return new CovariateValues(mismatches, insertions, deletions);
} }
// Used to get the covariate's value from input csv file during on-the-fly recalibration // Used to get the covariate's value from input csv file during on-the-fly recalibration
@ -74,17 +60,17 @@ public class QualityScoreCovariate implements RequiredCovariate {
} }
@Override @Override
public String keyFromBitSet(BitSet key) { public String formatKey(final long key) {
return String.format("%d", BitSetUtils.longFrom(key)); return String.format("%d", key);
} }
@Override @Override
public BitSet bitSetFromKey(Object key) { public long longFromKey(final Object key) {
return (key instanceof String) ? BitSetUtils.bitSetFrom(Byte.parseByte((String) key)) : BitSetUtils.bitSetFrom((Byte) key); return (key instanceof String) ? (long)Byte.parseByte((String) key) : (long)(Byte) key;
} }
@Override @Override
public int numberOfBits() { public int numberOfBits() {
return BitSetUtils.numberOfBitsToRepresent(QualityUtils.MAX_QUAL_SCORE); return BQSRKeyManager.numberOfBitsToRepresent(QualityUtils.MAX_QUAL_SCORE);
} }
} }

View File

@ -6,7 +6,6 @@ import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.recalibration.QualQuantizer; import org.broadinstitute.sting.utils.recalibration.QualQuantizer;
import java.util.Arrays; import java.util.Arrays;
import java.util.BitSet;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
@ -31,15 +30,15 @@ public class QuantizationInfo {
this(quantizedQuals, empiricalQualCounts, calculateQuantizationLevels(quantizedQuals)); this(quantizedQuals, empiricalQualCounts, calculateQuantizationLevels(quantizedQuals));
} }
public QuantizationInfo(Map<BQSRKeyManager, Map<BitSet, RecalDatum>> keysAndTablesMap, int quantizationLevels) { public QuantizationInfo(Map<BQSRKeyManager, Map<Long, RecalDatum>> keysAndTablesMap, int quantizationLevels) {
final Long [] qualHistogram = new Long[QualityUtils.MAX_QUAL_SCORE+1]; // create a histogram with the empirical quality distribution final Long [] qualHistogram = new Long[QualityUtils.MAX_QUAL_SCORE+1]; // create a histogram with the empirical quality distribution
for (int i = 0; i < qualHistogram.length; i++) for (int i = 0; i < qualHistogram.length; i++)
qualHistogram[i] = 0L; qualHistogram[i] = 0L;
Map<BitSet, RecalDatum> qualTable = null; // look for the quality score table Map<Long, RecalDatum> qualTable = null; // look for the quality score table
for (Map.Entry<BQSRKeyManager, Map<BitSet, RecalDatum>> entry : keysAndTablesMap.entrySet()) { for (Map.Entry<BQSRKeyManager, Map<Long, RecalDatum>> entry : keysAndTablesMap.entrySet()) {
BQSRKeyManager keyManager = entry.getKey(); BQSRKeyManager keyManager = entry.getKey();
if (keyManager.getRequiredCovariates().size() == 2) // it should be the only one with 2 required covaraites if (keyManager.getNumRequiredCovariates() == 2) // it should be the only one with 2 required covariates
qualTable = entry.getValue(); qualTable = entry.getValue();
} }

View File

@ -2,8 +2,6 @@ package org.broadinstitute.sting.gatk.walkers.bqsr;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import java.util.BitSet;
/** /**
* The object temporarily held by a read that describes all of it's covariates. * The object temporarily held by a read that describes all of it's covariates.
* *
@ -13,27 +11,29 @@ import java.util.BitSet;
* @since 2/8/12 * @since 2/8/12
*/ */
public class ReadCovariates { public class ReadCovariates {
private final BitSet[][] mismatchesKeySet; private final long[][] mismatchesKeySet;
private final BitSet[][] insertionsKeySet; private final long[][] insertionsKeySet;
private final BitSet[][] deletionsKeySet; private final long[][] deletionsKeySet;
private int nextCovariateIndex; private int currentCovariateIndex = 0;
public ReadCovariates(int readLength, int numberOfCovariates) { public ReadCovariates(int readLength, int numberOfCovariates) {
this.mismatchesKeySet = new BitSet[readLength][numberOfCovariates]; this.mismatchesKeySet = new long[readLength][numberOfCovariates];
this.insertionsKeySet = new BitSet[readLength][numberOfCovariates]; this.insertionsKeySet = new long[readLength][numberOfCovariates];
this.deletionsKeySet = new BitSet[readLength][numberOfCovariates]; this.deletionsKeySet = new long[readLength][numberOfCovariates];
this.nextCovariateIndex = 0;
} }
public void addCovariate(CovariateValues covariate) { public void setCovariateIndex(final int index) {
transposeCovariateValues(mismatchesKeySet, covariate.getMismatches()); currentCovariateIndex = index;
transposeCovariateValues(insertionsKeySet, covariate.getInsertions());
transposeCovariateValues(deletionsKeySet, covariate.getDeletions());
nextCovariateIndex++;
} }
public BitSet[] getKeySet(final int readPosition, final EventType errorModel) { public void addCovariate(final long mismatch, final long insertion, final long deletion, final int readOffset) {
mismatchesKeySet[readOffset][currentCovariateIndex] = mismatch;
insertionsKeySet[readOffset][currentCovariateIndex] = insertion;
deletionsKeySet[readOffset][currentCovariateIndex] = deletion;
}
public long[] getKeySet(final int readPosition, final EventType errorModel) {
switch (errorModel) { switch (errorModel) {
case BASE_SUBSTITUTION: case BASE_SUBSTITUTION:
return getMismatchesKeySet(readPosition); return getMismatchesKeySet(readPosition);
@ -46,35 +46,30 @@ public class ReadCovariates {
} }
} }
public BitSet[] getMismatchesKeySet(int readPosition) { public long[] getMismatchesKeySet(final int readPosition) {
return mismatchesKeySet[readPosition]; return mismatchesKeySet[readPosition];
} }
public BitSet[] getInsertionsKeySet(int readPosition) { public long[] getInsertionsKeySet(final int readPosition) {
return insertionsKeySet[readPosition]; return insertionsKeySet[readPosition];
} }
public BitSet[] getDeletionsKeySet(int readPosition) { public long[] getDeletionsKeySet(final int readPosition) {
return deletionsKeySet[readPosition]; return deletionsKeySet[readPosition];
} }
private void transposeCovariateValues(BitSet[][] keySet, BitSet[] covariateValues) {
for (int i = 0; i < covariateValues.length; i++)
keySet[i][nextCovariateIndex] = covariateValues[i];
}
/** /**
* Testing routines * Testing routines
*/ */
protected BitSet[][] getMismatchesKeySet() { protected long[][] getMismatchesKeySet() {
return mismatchesKeySet; return mismatchesKeySet;
} }
protected BitSet[][] getInsertionsKeySet() { protected long[][] getInsertionsKeySet() {
return insertionsKeySet; return insertionsKeySet;
} }
protected BitSet[][] getDeletionsKeySet() { protected long[][] getDeletionsKeySet() {
return deletionsKeySet; return deletionsKeySet;
} }
} }

View File

@ -1,11 +1,8 @@
package org.broadinstitute.sting.gatk.walkers.bqsr; package org.broadinstitute.sting.gatk.walkers.bqsr;
import org.broadinstitute.sting.utils.BitSetUtils;
import org.broadinstitute.sting.utils.sam.GATKSAMReadGroupRecord; import org.broadinstitute.sting.utils.sam.GATKSAMReadGroupRecord;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import java.util.Arrays;
import java.util.BitSet;
import java.util.HashMap; import java.util.HashMap;
/* /*
@ -43,23 +40,22 @@ import java.util.HashMap;
public class ReadGroupCovariate implements RequiredCovariate { public class ReadGroupCovariate implements RequiredCovariate {
private final HashMap<String, Short> readGroupLookupTable = new HashMap<String, Short>(); private final HashMap<String, Long> readGroupLookupTable = new HashMap<String, Long>();
private final HashMap<Short, String> readGroupReverseLookupTable = new HashMap<Short, String>(); private final HashMap<Long, String> readGroupReverseLookupTable = new HashMap<Long, String>();
private short nextId = 0; private long nextId = 0L;
// Initialize any member variables using the command-line arguments passed to the walkers // Initialize any member variables using the command-line arguments passed to the walkers
@Override @Override
public void initialize(final RecalibrationArgumentCollection RAC) { public void initialize(final RecalibrationArgumentCollection RAC) {}
}
@Override @Override
public CovariateValues getValues(final GATKSAMRecord read) { public void recordValues(final GATKSAMRecord read, final ReadCovariates values) {
final int l = read.getReadLength();
final String readGroupId = readGroupValueFromRG(read.getReadGroup()); final String readGroupId = readGroupValueFromRG(read.getReadGroup());
BitSet rg = bitSetForReadGroup(readGroupId); // All objects must output a BitSet, so we convert the "compressed" representation of the Read Group into a bitset final long key = keyForReadGroup(readGroupId);
BitSet[] readGroups = new BitSet[l];
Arrays.fill(readGroups, rg); final int l = read.getReadLength();
return new CovariateValues(readGroups, readGroups, readGroups); for (int i = 0; i < l; i++)
values.addCovariate(key, key, key, i);
} }
@Override @Override
@ -68,35 +64,28 @@ public class ReadGroupCovariate implements RequiredCovariate {
} }
@Override @Override
public String keyFromBitSet(BitSet key) { public String formatKey(final long key) {
return decodeReadGroup((short) BitSetUtils.longFrom(key)); return readGroupReverseLookupTable.get(key);
} }
@Override @Override
public BitSet bitSetFromKey(Object key) { public long longFromKey(Object key) {
return bitSetForReadGroup((String) key); return keyForReadGroup((String) key);
} }
@Override @Override
public int numberOfBits() { public int numberOfBits() {
return BitSetUtils.numberOfBitsToRepresent(Short.MAX_VALUE); return BQSRKeyManager.numberOfBitsToRepresent(Short.MAX_VALUE);
} }
private String decodeReadGroup(final short id) { private long keyForReadGroup(final String readGroupId) {
return readGroupReverseLookupTable.get(id); if (!readGroupLookupTable.containsKey(readGroupId)) {
}
private BitSet bitSetForReadGroup(String readGroupId) {
short shortId;
if (readGroupLookupTable.containsKey(readGroupId))
shortId = readGroupLookupTable.get(readGroupId);
else {
shortId = nextId;
readGroupLookupTable.put(readGroupId, nextId); readGroupLookupTable.put(readGroupId, nextId);
readGroupReverseLookupTable.put(nextId, readGroupId); readGroupReverseLookupTable.put(nextId, readGroupId);
nextId++; nextId++;
} }
return BitSetUtils.bitSetFrom(shortId);
return readGroupLookupTable.get(readGroupId);
} }
/** /**
@ -105,8 +94,8 @@ public class ReadGroupCovariate implements RequiredCovariate {
* @param rg the read group record * @param rg the read group record
* @return platform unit or readgroup id * @return platform unit or readgroup id
*/ */
private String readGroupValueFromRG(GATKSAMReadGroupRecord rg) { private String readGroupValueFromRG(final GATKSAMReadGroupRecord rg) {
String platformUnit = rg.getPlatformUnit(); final String platformUnit = rg.getPlatformUnit();
return platformUnit == null ? rg.getId() : platformUnit; return platformUnit == null ? rg.getId() : platformUnit;
} }

View File

@ -149,17 +149,17 @@ public class RecalDataManager {
* @param optionalCovariates list of optional covariates (in order) * @param optionalCovariates list of optional covariates (in order)
* @return a map with each key manager and it's corresponding recalibration table properly initialized * @return a map with each key manager and it's corresponding recalibration table properly initialized
*/ */
public static LinkedHashMap<BQSRKeyManager, Map<BitSet, RecalDatum>> initializeTables(ArrayList<Covariate> requiredCovariates, ArrayList<Covariate> optionalCovariates) { public static LinkedHashMap<BQSRKeyManager, Map<Long, RecalDatum>> initializeTables(ArrayList<Covariate> requiredCovariates, ArrayList<Covariate> optionalCovariates) {
final LinkedHashMap<BQSRKeyManager, Map<BitSet, RecalDatum>> tablesAndKeysMap = new LinkedHashMap<BQSRKeyManager, Map<BitSet, RecalDatum>>(); final LinkedHashMap<BQSRKeyManager, Map<Long, RecalDatum>> tablesAndKeysMap = new LinkedHashMap<BQSRKeyManager, Map<Long, RecalDatum>>();
ArrayList<Covariate> requiredCovariatesToAdd = new ArrayList<Covariate>(requiredCovariates.size() + 1); // incrementally add the covariates to create the recal tables with 1, 2 and 3 covariates. final ArrayList<Covariate> requiredCovariatesToAdd = new ArrayList<Covariate>(requiredCovariates.size() + 1); // incrementally add the covariates to create the recal tables with 1, 2 and 3 covariates.
ArrayList<Covariate> optionalCovariatesToAdd = new ArrayList<Covariate>(); // initialize an empty array of optional covariates to create the first few tables final ArrayList<Covariate> optionalCovariatesToAdd = new ArrayList<Covariate>(); // initialize an empty array of optional covariates to create the first few tables
for (Covariate covariate : requiredCovariates) { for (Covariate covariate : requiredCovariates) {
requiredCovariatesToAdd.add(covariate); requiredCovariatesToAdd.add(covariate);
final Map<BitSet, RecalDatum> recalTable = new HashMap<BitSet, RecalDatum>(); // initializing a new recal table for each required covariate (cumulatively) final Map<Long, RecalDatum> recalTable = new HashMap<Long, RecalDatum>(); // initializing a new recal table for each required covariate (cumulatively)
final BQSRKeyManager keyManager = new BQSRKeyManager(requiredCovariatesToAdd, optionalCovariatesToAdd); // initializing it's corresponding key manager final BQSRKeyManager keyManager = new BQSRKeyManager(requiredCovariatesToAdd, optionalCovariatesToAdd); // initializing it's corresponding key manager
tablesAndKeysMap.put(keyManager, recalTable); // adding the pair table+key to the map tablesAndKeysMap.put(keyManager, recalTable); // adding the pair table+key to the map
} }
final Map<BitSet, RecalDatum> recalTable = new HashMap<BitSet, RecalDatum>(Short.MAX_VALUE); // initializing a new recal table to hold all optional covariates final Map<Long, RecalDatum> recalTable = new HashMap<Long, RecalDatum>(Short.MAX_VALUE); // initializing a new recal table to hold all optional covariates
final BQSRKeyManager keyManager = new BQSRKeyManager(requiredCovariates, optionalCovariates); // initializing it's corresponding key manager final BQSRKeyManager keyManager = new BQSRKeyManager(requiredCovariates, optionalCovariates); // initializing it's corresponding key manager
tablesAndKeysMap.put(keyManager, recalTable); // adding the pair table+key to the map tablesAndKeysMap.put(keyManager, recalTable); // adding the pair table+key to the map
return tablesAndKeysMap; return tablesAndKeysMap;
@ -181,7 +181,7 @@ public class RecalDataManager {
final List<Class<? extends RequiredCovariate>> requiredClasses = new PluginManager<RequiredCovariate>(RequiredCovariate.class).getPlugins(); final List<Class<? extends RequiredCovariate>> requiredClasses = new PluginManager<RequiredCovariate>(RequiredCovariate.class).getPlugins();
final List<Class<? extends StandardCovariate>> standardClasses = new PluginManager<StandardCovariate>(StandardCovariate.class).getPlugins(); final List<Class<? extends StandardCovariate>> standardClasses = new PluginManager<StandardCovariate>(StandardCovariate.class).getPlugins();
ArrayList<Covariate> requiredCovariates = addRequiredCovariatesToList(requiredClasses); // add the required covariates final ArrayList<Covariate> requiredCovariates = addRequiredCovariatesToList(requiredClasses); // add the required covariates
ArrayList<Covariate> optionalCovariates = new ArrayList<Covariate>(); ArrayList<Covariate> optionalCovariates = new ArrayList<Covariate>();
if (argumentCollection.USE_STANDARD_COVARIATES) if (argumentCollection.USE_STANDARD_COVARIATES)
optionalCovariates = addStandardCovariatesToList(standardClasses); // add the standard covariates if -standard was specified by the user optionalCovariates = addStandardCovariatesToList(standardClasses); // add the standard covariates if -standard was specified by the user
@ -223,7 +223,7 @@ public class RecalDataManager {
logger.info(""); logger.info("");
} }
private static List<GATKReportTable> generateReportTables(Map<BQSRKeyManager, Map<BitSet, RecalDatum>> keysAndTablesMap) { private static List<GATKReportTable> generateReportTables(Map<BQSRKeyManager, Map<Long, RecalDatum>> keysAndTablesMap) {
List<GATKReportTable> result = new LinkedList<GATKReportTable>(); List<GATKReportTable> result = new LinkedList<GATKReportTable>();
int tableIndex = 0; int tableIndex = 0;
@ -235,23 +235,23 @@ public class RecalDataManager {
final Pair<String, String> nObservations = new Pair<String, String>(RecalDataManager.NUMBER_OBSERVATIONS_COLUMN_NAME, "%d"); final Pair<String, String> nObservations = new Pair<String, String>(RecalDataManager.NUMBER_OBSERVATIONS_COLUMN_NAME, "%d");
final Pair<String, String> nErrors = new Pair<String, String>(RecalDataManager.NUMBER_ERRORS_COLUMN_NAME, "%d"); final Pair<String, String> nErrors = new Pair<String, String>(RecalDataManager.NUMBER_ERRORS_COLUMN_NAME, "%d");
for (Map.Entry<BQSRKeyManager, Map<BitSet, RecalDatum>> entry : keysAndTablesMap.entrySet()) { for (Map.Entry<BQSRKeyManager, Map<Long, RecalDatum>> entry : keysAndTablesMap.entrySet()) {
BQSRKeyManager keyManager = entry.getKey(); final BQSRKeyManager keyManager = entry.getKey();
Map<BitSet, RecalDatum> recalTable = entry.getValue(); final Map<Long, RecalDatum> recalTable = entry.getValue();
boolean isReadGroupTable = tableIndex == 0; // special case for the read group table so we can print the extra column it needs. final boolean isReadGroupTable = tableIndex == 0; // special case for the read group table so we can print the extra column it needs.
List<Covariate> requiredList = keyManager.getRequiredCovariates(); // ask the key manager what required covariates were used in this recal table final Covariate[] requiredList = keyManager.getRequiredCovariates(); // ask the key manager what required covariates were used in this recal table
List<Covariate> optionalList = keyManager.getOptionalCovariates(); // ask the key manager what optional covariates were used in this recal table final Covariate[] optionalList = keyManager.getOptionalCovariates(); // ask the key manager what optional covariates were used in this recal table
ArrayList<Pair<String, String>> columnNames = new ArrayList<Pair<String, String>>(); // initialize the array to hold the column names final ArrayList<Pair<String, String>> columnNames = new ArrayList<Pair<String, String>>(); // initialize the array to hold the column names
for (Covariate covariate : requiredList) { for (final Covariate covariate : requiredList) {
String name = covariate.getClass().getSimpleName().split("Covariate")[0]; // get the covariate names and put them in order final String name = covariate.getClass().getSimpleName().split("Covariate")[0]; // get the covariate names and put them in order
columnNames.add(new Pair<String,String>(name, "%s")); // save the required covariate name so we can reference it in the future columnNames.add(new Pair<String,String>(name, "%s")); // save the required covariate name so we can reference it in the future
} }
if (optionalList.size() > 0) { if (optionalList.length > 0) {
columnNames.add(covariateValue); columnNames.add(covariateValue);
columnNames.add(covariateName); columnNames.add(covariateName);
} }
@ -263,30 +263,30 @@ public class RecalDataManager {
columnNames.add(nObservations); columnNames.add(nObservations);
columnNames.add(nErrors); columnNames.add(nErrors);
GATKReportTable reportTable = new GATKReportTable("RecalTable" + tableIndex++, "", columnNames.size()); final GATKReportTable reportTable = new GATKReportTable("RecalTable" + tableIndex++, "", columnNames.size());
for (Pair<String, String> columnName : columnNames) for (final Pair<String, String> columnName : columnNames)
reportTable.addColumn(columnName.getFirst(), columnName.getSecond()); // every table must have the event type reportTable.addColumn(columnName.getFirst(), columnName.getSecond()); // every table must have the event type
int rowIndex = 0; int rowIndex = 0;
for (Map.Entry<BitSet, RecalDatum> recalTableEntry : recalTable.entrySet()) { // create a map with column name => key value for all covariate keys for (Map.Entry<Long, RecalDatum> recalTableEntry : recalTable.entrySet()) { // create a map with column name => key value for all covariate keys
BitSet bitSetKey = recalTableEntry.getKey(); final Long bitSetKey = recalTableEntry.getKey();
Map<String, Object> columnData = new HashMap<String, Object>(columnNames.size()); final Map<String, Object> columnData = new HashMap<String, Object>(columnNames.size());
Iterator<Pair<String, String>> iterator = columnNames.iterator(); final Iterator<Pair<String, String>> iterator = columnNames.iterator();
for (Object key : keyManager.keySetFrom(bitSetKey)) { for (final Object key : keyManager.keySetFrom(bitSetKey)) {
String columnName = iterator.next().getFirst(); final String columnName = iterator.next().getFirst();
columnData.put(columnName, key); columnData.put(columnName, key);
} }
RecalDatum datum = recalTableEntry.getValue(); final RecalDatum datum = recalTableEntry.getValue();
columnData.put(iterator.next().getFirst(), datum.getEmpiricalQuality()); columnData.put(iterator.next().getFirst(), datum.getEmpiricalQuality());
if (isReadGroupTable) if (isReadGroupTable)
columnData.put(iterator.next().getFirst(), datum.getEstimatedQReported()); // we only add the estimated Q reported in the RG table columnData.put(iterator.next().getFirst(), datum.getEstimatedQReported()); // we only add the estimated Q reported in the RG table
columnData.put(iterator.next().getFirst(), datum.numObservations); columnData.put(iterator.next().getFirst(), datum.numObservations);
columnData.put(iterator.next().getFirst(), datum.numMismatches); columnData.put(iterator.next().getFirst(), datum.numMismatches);
for (Map.Entry<String, Object> dataEntry : columnData.entrySet()) { for (final Map.Entry<String, Object> dataEntry : columnData.entrySet()) {
String columnName = dataEntry.getKey(); final String columnName = dataEntry.getKey();
Object value = dataEntry.getValue(); final Object value = dataEntry.getValue();
reportTable.set(rowIndex, columnName, value.toString()); reportTable.set(rowIndex, columnName, value.toString());
} }
rowIndex++; rowIndex++;
@ -296,16 +296,16 @@ public class RecalDataManager {
return result; return result;
} }
public static void outputRecalibrationReport(RecalibrationArgumentCollection RAC, QuantizationInfo quantizationInfo, Map<BQSRKeyManager, Map<BitSet, RecalDatum>> keysAndTablesMap, PrintStream outputFile) { public static void outputRecalibrationReport(RecalibrationArgumentCollection RAC, QuantizationInfo quantizationInfo, Map<BQSRKeyManager, Map<Long, RecalDatum>> keysAndTablesMap, PrintStream outputFile) {
outputRecalibrationReport(RAC.generateReportTable(), quantizationInfo.generateReportTable(), generateReportTables(keysAndTablesMap), outputFile); outputRecalibrationReport(RAC.generateReportTable(), quantizationInfo.generateReportTable(), generateReportTables(keysAndTablesMap), outputFile);
} }
public static void outputRecalibrationReport(GATKReportTable argumentTable, QuantizationInfo quantizationInfo, LinkedHashMap<BQSRKeyManager,Map<BitSet, RecalDatum>> keysAndTablesMap, PrintStream outputFile) { public static void outputRecalibrationReport(GATKReportTable argumentTable, QuantizationInfo quantizationInfo, LinkedHashMap<BQSRKeyManager,Map<Long, RecalDatum>> keysAndTablesMap, PrintStream outputFile) {
outputRecalibrationReport(argumentTable, quantizationInfo.generateReportTable(), generateReportTables(keysAndTablesMap), outputFile); outputRecalibrationReport(argumentTable, quantizationInfo.generateReportTable(), generateReportTables(keysAndTablesMap), outputFile);
} }
private static void outputRecalibrationReport(GATKReportTable argumentTable, GATKReportTable quantizationTable, List<GATKReportTable> recalTables, PrintStream outputFile) { private static void outputRecalibrationReport(GATKReportTable argumentTable, GATKReportTable quantizationTable, List<GATKReportTable> recalTables, PrintStream outputFile) {
GATKReport report = new GATKReport(); final GATKReport report = new GATKReport();
report.addTable(argumentTable); report.addTable(argumentTable);
report.addTable(quantizationTable); report.addTable(quantizationTable);
report.addTables(recalTables); report.addTables(recalTables);
@ -328,7 +328,7 @@ public class RecalDataManager {
final File plotFileName = new File(csvFileName + ".pdf"); final File plotFileName = new File(csvFileName + ".pdf");
files.getFirst().close(); files.getFirst().close();
RScriptExecutor executor = new RScriptExecutor(); final RScriptExecutor executor = new RScriptExecutor();
executor.addScript(new Resource(SCRIPT_FILE, RecalDataManager.class)); executor.addScript(new Resource(SCRIPT_FILE, RecalDataManager.class));
executor.addArgs(csvFileName.getAbsolutePath()); executor.addArgs(csvFileName.getAbsolutePath());
executor.addArgs(plotFileName.getAbsolutePath()); executor.addArgs(plotFileName.getAbsolutePath());
@ -340,34 +340,34 @@ public class RecalDataManager {
} }
public static void generateRecalibrationPlot(File filename, LinkedHashMap<BQSRKeyManager, Map<BitSet, RecalDatum>> original, boolean keepIntermediates) { public static void generateRecalibrationPlot(File filename, LinkedHashMap<BQSRKeyManager, Map<Long, RecalDatum>> original, boolean keepIntermediates) {
Pair<PrintStream, File> files = initializeRecalibrationPlot(filename); final Pair<PrintStream, File> files = initializeRecalibrationPlot(filename);
writeCSV(files.getFirst(), original, "ORIGINAL", true); writeCSV(files.getFirst(), original, "ORIGINAL", true);
outputRecalibrationPlot(files, keepIntermediates); outputRecalibrationPlot(files, keepIntermediates);
} }
public static void generateRecalibrationPlot(File filename, LinkedHashMap<BQSRKeyManager, Map<BitSet, RecalDatum>> original, LinkedHashMap<BQSRKeyManager, Map<BitSet, RecalDatum>> recalibrated, boolean keepIntermediates) { public static void generateRecalibrationPlot(File filename, LinkedHashMap<BQSRKeyManager, Map<Long, RecalDatum>> original, LinkedHashMap<BQSRKeyManager, Map<Long, RecalDatum>> recalibrated, boolean keepIntermediates) {
Pair<PrintStream, File> files = initializeRecalibrationPlot(filename); final Pair<PrintStream, File> files = initializeRecalibrationPlot(filename);
writeCSV(files.getFirst(), recalibrated, "RECALIBRATED", true); writeCSV(files.getFirst(), recalibrated, "RECALIBRATED", true);
writeCSV(files.getFirst(), original, "ORIGINAL", false); writeCSV(files.getFirst(), original, "ORIGINAL", false);
outputRecalibrationPlot(files, keepIntermediates); outputRecalibrationPlot(files, keepIntermediates);
} }
private static void writeCSV(PrintStream deltaTableFile, LinkedHashMap<BQSRKeyManager, Map<BitSet, RecalDatum>> map, String recalibrationMode, boolean printHeader) { private static void writeCSV(PrintStream deltaTableFile, LinkedHashMap<BQSRKeyManager, Map<Long, RecalDatum>> map, String recalibrationMode, boolean printHeader) {
final int QUALITY_SCORE_COVARIATE_INDEX = 1; final int QUALITY_SCORE_COVARIATE_INDEX = 1;
final Map<BitSet, RecalDatum> deltaTable = new HashMap<BitSet, RecalDatum>(); final Map<Long, RecalDatum> deltaTable = new HashMap<Long, RecalDatum>();
BQSRKeyManager deltaKeyManager = null; BQSRKeyManager deltaKeyManager = null;
for (Map.Entry<BQSRKeyManager, Map<BitSet, RecalDatum>> tableEntry : map.entrySet()) { for (Map.Entry<BQSRKeyManager, Map<Long, RecalDatum>> tableEntry : map.entrySet()) {
BQSRKeyManager keyManager = tableEntry.getKey(); final BQSRKeyManager keyManager = tableEntry.getKey();
if (keyManager.getOptionalCovariates().size() > 0) { // initialize with the 'all covariates' table if (keyManager.getNumOptionalCovariates() > 0) { // initialize with the 'all covariates' table
// create a key manager for the delta table // create a key manager for the delta table
final List<Covariate> requiredCovariates = Arrays.asList(keyManager.getRequiredCovariates().get(0)); // include the read group covariate as the only required covariate final List<Covariate> requiredCovariates = Arrays.asList(keyManager.getRequiredCovariates()[0]); // include the read group covariate as the only required covariate
List<Covariate> optionalCovariates = new ArrayList<Covariate>(); final List<Covariate> optionalCovariates = new ArrayList<Covariate>();
optionalCovariates.add(keyManager.getRequiredCovariates().get(1)); // include the quality score covariate as an optional covariate optionalCovariates.add(keyManager.getRequiredCovariates()[1]); // include the quality score covariate as an optional covariate
optionalCovariates.addAll(keyManager.getOptionalCovariates()); // include all optional covariates optionalCovariates.addAll(Arrays.asList(keyManager.getOptionalCovariates())); // include all optional covariates
deltaKeyManager = new BQSRKeyManager(requiredCovariates, optionalCovariates); // initialize the key manager deltaKeyManager = new BQSRKeyManager(requiredCovariates, optionalCovariates); // initialize the key manager
} }
} }
@ -376,37 +376,37 @@ public class RecalDataManager {
throw new ReviewedStingException ("Couldn't find the covariates table"); throw new ReviewedStingException ("Couldn't find the covariates table");
boolean readyToPrint = false; boolean readyToPrint = false;
for (Map.Entry<BQSRKeyManager, Map<BitSet, RecalDatum>> tableEntry : map.entrySet()) { for (Map.Entry<BQSRKeyManager, Map<Long, RecalDatum>> tableEntry : map.entrySet()) {
BQSRKeyManager keyManager = tableEntry.getKey(); final BQSRKeyManager keyManager = tableEntry.getKey();
if (keyManager.getRequiredCovariates().size() == 2 && keyManager.getOptionalCovariates().isEmpty()) { // look for the QualityScore table if (keyManager.getNumRequiredCovariates() == 2 && keyManager.getNumOptionalCovariates() == 0) { // look for the QualityScore table
Map<BitSet, RecalDatum> table = tableEntry.getValue(); final Map<Long, RecalDatum> table = tableEntry.getValue();
// add the quality score table to the delta table // add the quality score table to the delta table
for (Map.Entry<BitSet, RecalDatum> entry : table.entrySet()) { // go through every element in the covariates table to create the delta table for (final Map.Entry<Long, RecalDatum> entry : table.entrySet()) { // go through every element in the covariates table to create the delta table
RecalDatum recalDatum = entry.getValue(); // the current element (recal datum) final RecalDatum recalDatum = entry.getValue(); // the current element (recal datum)
List<Object> covs = keyManager.keySetFrom(entry.getKey()); // extract the key objects from the bitset key final List<Object> covs = keyManager.keySetFrom(entry.getKey()); // extract the key objects from the bitset key
List<Object> newCovs = new ArrayList<Object>(4); final List<Object> newCovs = new ArrayList<Object>(4);
newCovs.add(0, covs.get(0)); // replace the covariate value with the quality score newCovs.add(0, covs.get(0)); // replace the covariate value with the quality score
newCovs.add(1, covs.get(1)); newCovs.add(1, covs.get(1));
newCovs.add(2, "QualityScore"); // replace the covariate name with QualityScore (for the QualityScore covariate) newCovs.add(2, "QualityScore"); // replace the covariate name with QualityScore (for the QualityScore covariate)
newCovs.add(3, covs.get(2)); newCovs.add(3, covs.get(2));
BitSet deltaKey = deltaKeyManager.bitSetFromKey(newCovs.toArray()); // create a new bitset key for the delta table final long deltaKey = deltaKeyManager.longFromKey(newCovs.toArray()); // create a new bitset key for the delta table
addToDeltaTable(deltaTable, deltaKey, recalDatum); // add this covariate to the delta table addToDeltaTable(deltaTable, deltaKey, recalDatum); // add this covariate to the delta table
} }
} }
else if (keyManager.getOptionalCovariates().size() > 0) { // look for the optional covariates table else if (keyManager.getNumOptionalCovariates() > 0) { // look for the optional covariates table
Map<BitSet, RecalDatum> table = tableEntry.getValue(); final Map<Long, RecalDatum> table = tableEntry.getValue();
// add the optional covariates to the delta table // add the optional covariates to the delta table
for (Map.Entry<BitSet, RecalDatum> entry : table.entrySet()) { // go through every element in the covariates table to create the delta table for (final Map.Entry<Long, RecalDatum> entry : table.entrySet()) { // go through every element in the covariates table to create the delta table
RecalDatum recalDatum = entry.getValue(); // the current element (recal datum) final RecalDatum recalDatum = entry.getValue(); // the current element (recal datum)
List<Object> covs = keyManager.keySetFrom(entry.getKey()); // extract the key objects from the bitset key final List<Object> covs = keyManager.keySetFrom(entry.getKey()); // extract the key objects from the bitset key
covs.remove(QUALITY_SCORE_COVARIATE_INDEX); // reset the quality score covariate to 0 from the keyset (so we aggregate all rows regardless of QS) covs.remove(QUALITY_SCORE_COVARIATE_INDEX); // reset the quality score covariate to 0 from the keyset (so we aggregate all rows regardless of QS)
BitSet deltaKey = deltaKeyManager.bitSetFromKey(covs.toArray()); // create a new bitset key for the delta table final long deltaKey = deltaKeyManager.longFromKey(covs.toArray()); // create a new bitset key for the delta table
addToDeltaTable(deltaTable, deltaKey, recalDatum); // add this covariate to the delta table addToDeltaTable(deltaTable, deltaKey, recalDatum); // add this covariate to the delta table
} }
readyToPrint = true; readyToPrint = true;
@ -416,7 +416,7 @@ public class RecalDataManager {
if (readyToPrint) { if (readyToPrint) {
if (printHeader) { if (printHeader) {
List<String> header = new LinkedList<String>(); final List<String> header = new LinkedList<String>();
header.add("ReadGroup"); header.add("ReadGroup");
header.add("CovariateValue"); header.add("CovariateValue");
header.add("CovariateName"); header.add("CovariateName");
@ -431,9 +431,9 @@ public class RecalDataManager {
} }
// print each data line // print each data line
for(Map.Entry<BitSet, RecalDatum> deltaEntry : deltaTable.entrySet()) { for (final Map.Entry<Long, RecalDatum> deltaEntry : deltaTable.entrySet()) {
List<Object> deltaKeys = deltaKeyManager.keySetFrom(deltaEntry.getKey()); final List<Object> deltaKeys = deltaKeyManager.keySetFrom(deltaEntry.getKey());
RecalDatum deltaDatum = deltaEntry.getValue(); final RecalDatum deltaDatum = deltaEntry.getValue();
deltaTableFile.print(Utils.join(",", deltaKeys)); deltaTableFile.print(Utils.join(",", deltaKeys));
deltaTableFile.print("," + deltaDatum.stringForCSV()); deltaTableFile.print("," + deltaDatum.stringForCSV());
deltaTableFile.println("," + recalibrationMode); deltaTableFile.println("," + recalibrationMode);
@ -453,8 +453,8 @@ public class RecalDataManager {
* @param deltaKey the key to the table * @param deltaKey the key to the table
* @param recalDatum the recal datum to combine with the accuracyDatum element in the table * @param recalDatum the recal datum to combine with the accuracyDatum element in the table
*/ */
private static void addToDeltaTable(Map<BitSet, RecalDatum> deltaTable, BitSet deltaKey, RecalDatum recalDatum) { private static void addToDeltaTable(Map<Long, RecalDatum> deltaTable, Long deltaKey, RecalDatum recalDatum) {
RecalDatum deltaDatum = deltaTable.get(deltaKey); // check if we already have a RecalDatum for this key final RecalDatum deltaDatum = deltaTable.get(deltaKey); // check if we already have a RecalDatum for this key
if (deltaDatum == null) if (deltaDatum == null)
deltaTable.put(deltaKey, new RecalDatum(recalDatum)); // if we don't have a key yet, create a new one with the same values as the curent datum deltaTable.put(deltaKey, new RecalDatum(recalDatum)); // if we don't have a key yet, create a new one with the same values as the curent datum
else else
@ -611,18 +611,32 @@ public class RecalDataManager {
* @param requestedCovariates The list of requested covariates. * @param requestedCovariates The list of requested covariates.
* @return a matrix with all the covariates calculated for every base in the read * @return a matrix with all the covariates calculated for every base in the read
*/ */
public static ReadCovariates computeCovariates(final GATKSAMRecord read, final List<Covariate> requestedCovariates) { public static ReadCovariates computeCovariates(final GATKSAMRecord read, final Covariate[] requestedCovariates) {
final int numRequestedCovariates = requestedCovariates.size(); final ReadCovariates readCovariates = new ReadCovariates(read.getReadLength(), requestedCovariates.length);
final int readLength = read.getReadLength(); computeCovariates(read, requestedCovariates, readCovariates);
final ReadCovariates readCovariates = new ReadCovariates(readLength, numRequestedCovariates);
// Loop through the list of requested covariates and compute the values of each covariate for all positions in this read
for (Covariate covariate : requestedCovariates)
readCovariates.addCovariate(covariate.getValues(read));
return readCovariates; return readCovariates;
} }
/**
* Computes all requested covariates for every offset in the given read
* by calling covariate.getValues(..).
*
* It populates an array of covariate values where result[i][j] is the covariate
* value for the ith position in the read and the jth covariate in
* reqeustedCovariates list.
*
* @param read The read for which to compute covariate values.
* @param requestedCovariates The list of requested covariates.
* @param readCovariates The object to store the covariate values
*/
public static void computeCovariates(final GATKSAMRecord read, final Covariate[] requestedCovariates, final ReadCovariates readCovariates) {
// Loop through the list of requested covariates and compute the values of each covariate for all positions in this read
for (int i = 0; i < requestedCovariates.length; i++) {
readCovariates.setCovariateIndex(i);
requestedCovariates[i].recordValues(read, readCovariates);
}
}
/** /**
* Perform a certain transversion (A <-> C or G <-> T) on the base. * Perform a certain transversion (A <-> C or G <-> T) on the base.
* *

View File

@ -114,16 +114,10 @@ public class RecalibrationArgumentCollection {
public int MISMATCHES_CONTEXT_SIZE = 2; public int MISMATCHES_CONTEXT_SIZE = 2;
/** /**
* The context covariate will use a context of this size to calculate it's covariate value for base insertions * The context covariate will use a context of this size to calculate it's covariate value for base insertions and deletions
*/ */
@Argument(fullName = "insertions_context_size", shortName = "ics", doc = "size of the k-mer context to be used for base insertions", required = false) @Argument(fullName = "indels_context_size", shortName = "ics", doc = "size of the k-mer context to be used for base insertions and deletions", required = false)
public int INSERTIONS_CONTEXT_SIZE = 8; public int INDELS_CONTEXT_SIZE = 8;
/**
* The context covariate will use a context of this size to calculate it's covariate value for base deletions
*/
@Argument(fullName = "deletions_context_size", shortName = "dcs", doc = "size of the k-mer context to be used for base deletions", required = false)
public int DELETIONS_CONTEXT_SIZE = 8;
/** /**
* A default base qualities to use as a prior (reported quality) in the mismatch covariate model. This value will replace all base qualities in the read for this default value. Negative value turns it off (default is off) * A default base qualities to use as a prior (reported quality) in the mismatch covariate model. This value will replace all base qualities in the read for this default value. Negative value turns it off (default is off)
@ -188,10 +182,8 @@ public class RecalibrationArgumentCollection {
argumentsTable.set("solid_nocall_strategy", RecalDataManager.ARGUMENT_VALUE_COLUMN_NAME, SOLID_NOCALL_STRATEGY); argumentsTable.set("solid_nocall_strategy", RecalDataManager.ARGUMENT_VALUE_COLUMN_NAME, SOLID_NOCALL_STRATEGY);
argumentsTable.addRowID("mismatches_context_size", true); argumentsTable.addRowID("mismatches_context_size", true);
argumentsTable.set("mismatches_context_size", RecalDataManager.ARGUMENT_VALUE_COLUMN_NAME, MISMATCHES_CONTEXT_SIZE); argumentsTable.set("mismatches_context_size", RecalDataManager.ARGUMENT_VALUE_COLUMN_NAME, MISMATCHES_CONTEXT_SIZE);
argumentsTable.addRowID("insertions_context_size", true); argumentsTable.addRowID("indels_context_size", true);
argumentsTable.set("insertions_context_size", RecalDataManager.ARGUMENT_VALUE_COLUMN_NAME, INSERTIONS_CONTEXT_SIZE); argumentsTable.set("indels_context_size", RecalDataManager.ARGUMENT_VALUE_COLUMN_NAME, INDELS_CONTEXT_SIZE);
argumentsTable.addRowID("deletions_context_size", true);
argumentsTable.set("deletions_context_size", RecalDataManager.ARGUMENT_VALUE_COLUMN_NAME, DELETIONS_CONTEXT_SIZE);
argumentsTable.addRowID("mismatches_default_quality", true); argumentsTable.addRowID("mismatches_default_quality", true);
argumentsTable.set("mismatches_default_quality", RecalDataManager.ARGUMENT_VALUE_COLUMN_NAME, MISMATCHES_DEFAULT_QUALITY); argumentsTable.set("mismatches_default_quality", RecalDataManager.ARGUMENT_VALUE_COLUMN_NAME, MISMATCHES_DEFAULT_QUALITY);
argumentsTable.addRowID("insertions_default_quality", true); argumentsTable.addRowID("insertions_default_quality", true);

View File

@ -18,8 +18,8 @@ import java.util.*;
*/ */
public class RecalibrationReport { public class RecalibrationReport {
private QuantizationInfo quantizationInfo; // histogram containing the counts for qual quantization (calculated after recalibration is done) private QuantizationInfo quantizationInfo; // histogram containing the counts for qual quantization (calculated after recalibration is done)
private final LinkedHashMap<BQSRKeyManager, Map<BitSet, RecalDatum>> keysAndTablesMap; // quick access reference to the read group table and its key manager private final LinkedHashMap<BQSRKeyManager, Map<Long, RecalDatum>> keysAndTablesMap; // quick access reference to the read group table and its key manager
private final ArrayList<Covariate> requestedCovariates = new ArrayList<Covariate>(); // list of all covariates to be used in this calculation private final Covariate[] requestedCovariates; // list of all covariates to be used in this calculation
private final GATKReportTable argumentTable; // keep the argument table untouched just for output purposes private final GATKReportTable argumentTable; // keep the argument table untouched just for output purposes
private final RecalibrationArgumentCollection RAC; // necessary for quantizing qualities with the same parameter private final RecalibrationArgumentCollection RAC; // necessary for quantizing qualities with the same parameter
@ -36,21 +36,25 @@ public class RecalibrationReport {
Pair<ArrayList<Covariate>, ArrayList<Covariate>> covariates = RecalDataManager.initializeCovariates(RAC); // initialize the required and optional covariates Pair<ArrayList<Covariate>, ArrayList<Covariate>> covariates = RecalDataManager.initializeCovariates(RAC); // initialize the required and optional covariates
ArrayList<Covariate> requiredCovariates = covariates.getFirst(); ArrayList<Covariate> requiredCovariates = covariates.getFirst();
ArrayList<Covariate> optionalCovariates = covariates.getSecond(); ArrayList<Covariate> optionalCovariates = covariates.getSecond();
requestedCovariates.addAll(requiredCovariates); // add all required covariates to the list of requested covariates requestedCovariates = new Covariate[requiredCovariates.size() + optionalCovariates.size()];
requestedCovariates.addAll(optionalCovariates); // add all optional covariates to the list of requested covariates int covariateIndex = 0;
for (final Covariate covariate : requiredCovariates)
requestedCovariates[covariateIndex++] = covariate;
for (final Covariate covariate : optionalCovariates)
requestedCovariates[covariateIndex++] = covariate;
for (Covariate cov : requestedCovariates) for (Covariate cov : requestedCovariates)
cov.initialize(RAC); // initialize any covariate member variables using the shared argument collection cov.initialize(RAC); // initialize any covariate member variables using the shared argument collection
keysAndTablesMap = new LinkedHashMap<BQSRKeyManager, Map<BitSet, RecalDatum>>(); keysAndTablesMap = new LinkedHashMap<BQSRKeyManager, Map<Long, RecalDatum>>();
ArrayList<Covariate> requiredCovariatesToAdd = new ArrayList<Covariate>(requiredCovariates.size()); // incrementally add the covariates to create the recal tables with 1, 2 and 3 covariates. ArrayList<Covariate> requiredCovariatesToAdd = new ArrayList<Covariate>(requiredCovariates.size()); // incrementally add the covariates to create the recal tables with 1, 2 and 3 covariates.
ArrayList<Covariate> optionalCovariatesToAdd = new ArrayList<Covariate>(); // initialize an empty array of optional covariates to create the first few tables ArrayList<Covariate> optionalCovariatesToAdd = new ArrayList<Covariate>(); // initialize an empty array of optional covariates to create the first few tables
for (Covariate covariate : requiredCovariates) { for (Covariate covariate : requiredCovariates) {
requiredCovariatesToAdd.add(covariate); requiredCovariatesToAdd.add(covariate);
final Map<BitSet, RecalDatum> table; // initializing a new recal table for each required covariate (cumulatively) final Map<Long, RecalDatum> table; // initializing a new recal table for each required covariate (cumulatively)
final BQSRKeyManager keyManager = new BQSRKeyManager(requiredCovariatesToAdd, optionalCovariatesToAdd); // initializing it's corresponding key manager final BQSRKeyManager keyManager = new BQSRKeyManager(requiredCovariatesToAdd, optionalCovariatesToAdd); // initializing it's corresponding key manager
int nRequiredCovariates = requiredCovariatesToAdd.size(); // the number of required covariates defines which table we are looking at (RG, QUAL or ALL_COVARIATES) final int nRequiredCovariates = requiredCovariatesToAdd.size(); // the number of required covariates defines which table we are looking at (RG, QUAL or ALL_COVARIATES)
final String UNRECOGNIZED_REPORT_TABLE_EXCEPTION = "Unrecognized table. Did you add an extra required covariate? This is a hard check."; final String UNRECOGNIZED_REPORT_TABLE_EXCEPTION = "Unrecognized table. Did you add an extra required covariate? This is a hard check.";
if (nRequiredCovariates == 1) { // if there is only one required covariate, this is the read group table if (nRequiredCovariates == 1) { // if there is only one required covariate, this is the read group table
final GATKReportTable reportTable = report.getTable(RecalDataManager.READGROUP_REPORT_TABLE_TITLE); final GATKReportTable reportTable = report.getTable(RecalDataManager.READGROUP_REPORT_TABLE_TITLE);
@ -69,15 +73,16 @@ public class RecalibrationReport {
final BQSRKeyManager keyManager = new BQSRKeyManager(requiredCovariates, optionalCovariates); // initializing it's corresponding key manager final BQSRKeyManager keyManager = new BQSRKeyManager(requiredCovariates, optionalCovariates); // initializing it's corresponding key manager
final GATKReportTable reportTable = report.getTable(RecalDataManager.ALL_COVARIATES_REPORT_TABLE_TITLE); final GATKReportTable reportTable = report.getTable(RecalDataManager.ALL_COVARIATES_REPORT_TABLE_TITLE);
final Map<BitSet, RecalDatum> table = parseAllCovariatesTable(keyManager, reportTable); final Map<Long, RecalDatum> table = parseAllCovariatesTable(keyManager, reportTable);
keysAndTablesMap.put(keyManager, table); keysAndTablesMap.put(keyManager, table);
} }
protected RecalibrationReport(QuantizationInfo quantizationInfo, LinkedHashMap<BQSRKeyManager, Map<BitSet, RecalDatum>> keysAndTablesMap, GATKReportTable argumentTable, RecalibrationArgumentCollection RAC) { protected RecalibrationReport(final QuantizationInfo quantizationInfo, final LinkedHashMap<BQSRKeyManager, Map<Long, RecalDatum>> keysAndTablesMap, final GATKReportTable argumentTable, final RecalibrationArgumentCollection RAC) {
this.quantizationInfo = quantizationInfo; this.quantizationInfo = quantizationInfo;
this.keysAndTablesMap = keysAndTablesMap; this.keysAndTablesMap = keysAndTablesMap;
this.argumentTable = argumentTable; this.argumentTable = argumentTable;
this.RAC = RAC; this.RAC = RAC;
this.requestedCovariates = null;
} }
/** /**
@ -94,25 +99,25 @@ public class RecalibrationReport {
* @param other the recalibration report to combine with this one * @param other the recalibration report to combine with this one
*/ */
public void combine(RecalibrationReport other) { public void combine(RecalibrationReport other) {
Iterator<Map.Entry<BQSRKeyManager, Map<BitSet, RecalDatum>>> thisIterator = keysAndTablesMap.entrySet().iterator(); Iterator<Map.Entry<BQSRKeyManager, Map<Long, RecalDatum>>> thisIterator = keysAndTablesMap.entrySet().iterator();
for (Map.Entry<BQSRKeyManager, Map<BitSet, RecalDatum>> otherEntry : other.getKeysAndTablesMap().entrySet()) { for (Map.Entry<BQSRKeyManager, Map<Long, RecalDatum>> otherEntry : other.getKeysAndTablesMap().entrySet()) {
Map.Entry<BQSRKeyManager, Map<BitSet, RecalDatum>> thisEntry = thisIterator.next(); Map.Entry<BQSRKeyManager, Map<Long, RecalDatum>> thisEntry = thisIterator.next();
Map<BitSet, RecalDatum> thisTable = thisEntry.getValue(); final Map<Long, RecalDatum> thisTable = thisEntry.getValue();
BQSRKeyManager thisKeyManager = thisEntry.getKey(); final BQSRKeyManager thisKeyManager = thisEntry.getKey();
BQSRKeyManager otherKeyManager = otherEntry.getKey(); final BQSRKeyManager otherKeyManager = otherEntry.getKey();
for (Map.Entry<BitSet, RecalDatum> otherTableEntry : otherEntry.getValue().entrySet()) { for (Map.Entry<Long, RecalDatum> otherTableEntry : otherEntry.getValue().entrySet()) {
RecalDatum otherDatum = otherTableEntry.getValue(); final RecalDatum otherDatum = otherTableEntry.getValue();
BitSet otherBitKey = otherTableEntry.getKey(); final Long otherBitKey = otherTableEntry.getKey();
List<Object> otherObjectKey = otherKeyManager.keySetFrom(otherBitKey); final List<Object> otherObjectKey = otherKeyManager.keySetFrom(otherBitKey);
BitSet thisBitKey = thisKeyManager.bitSetFromKey(otherObjectKey.toArray()); final long thisKey = thisKeyManager.longFromKey(otherObjectKey.toArray());
RecalDatum thisDatum = thisTable.get(thisBitKey); final RecalDatum thisDatum = thisTable.get(thisKey);
if (thisDatum == null) if (thisDatum == null)
thisTable.put(thisBitKey, otherDatum); thisTable.put(thisKey, otherDatum);
else else
thisDatum.combine(otherDatum); thisDatum.combine(otherDatum);
} }
@ -123,11 +128,11 @@ public class RecalibrationReport {
return quantizationInfo; return quantizationInfo;
} }
public LinkedHashMap<BQSRKeyManager, Map<BitSet, RecalDatum>> getKeysAndTablesMap() { public LinkedHashMap<BQSRKeyManager, Map<Long, RecalDatum>> getKeysAndTablesMap() {
return keysAndTablesMap; return keysAndTablesMap;
} }
public ArrayList<Covariate> getRequestedCovariates() { public Covariate[] getRequestedCovariates() {
return requestedCovariates; return requestedCovariates;
} }
@ -138,7 +143,7 @@ public class RecalibrationReport {
* @param reportTable the GATKReport table containing data for this table * @param reportTable the GATKReport table containing data for this table
* @return a lookup table indexed by bitsets containing the empirical quality and estimated quality reported for every key. * @return a lookup table indexed by bitsets containing the empirical quality and estimated quality reported for every key.
*/ */
private Map<BitSet, RecalDatum> parseAllCovariatesTable(BQSRKeyManager keyManager, GATKReportTable reportTable) { private Map<Long, RecalDatum> parseAllCovariatesTable(BQSRKeyManager keyManager, GATKReportTable reportTable) {
ArrayList<String> columnNamesOrderedList = new ArrayList<String>(5); ArrayList<String> columnNamesOrderedList = new ArrayList<String>(5);
columnNamesOrderedList.add(RecalDataManager.READGROUP_COLUMN_NAME); columnNamesOrderedList.add(RecalDataManager.READGROUP_COLUMN_NAME);
columnNamesOrderedList.add(RecalDataManager.QUALITY_SCORE_COLUMN_NAME); columnNamesOrderedList.add(RecalDataManager.QUALITY_SCORE_COLUMN_NAME);
@ -155,7 +160,7 @@ public class RecalibrationReport {
* @param reportTable the GATKReport table containing data for this table * @param reportTable the GATKReport table containing data for this table
* @return a lookup table indexed by bitsets containing the empirical quality and estimated quality reported for every key. * @return a lookup table indexed by bitsets containing the empirical quality and estimated quality reported for every key.
*/ */
private Map<BitSet, RecalDatum> parseQualityScoreTable(BQSRKeyManager keyManager, GATKReportTable reportTable) { private Map<Long, RecalDatum> parseQualityScoreTable(BQSRKeyManager keyManager, GATKReportTable reportTable) {
ArrayList<String> columnNamesOrderedList = new ArrayList<String>(3); ArrayList<String> columnNamesOrderedList = new ArrayList<String>(3);
columnNamesOrderedList.add(RecalDataManager.READGROUP_COLUMN_NAME); columnNamesOrderedList.add(RecalDataManager.READGROUP_COLUMN_NAME);
columnNamesOrderedList.add(RecalDataManager.QUALITY_SCORE_COLUMN_NAME); columnNamesOrderedList.add(RecalDataManager.QUALITY_SCORE_COLUMN_NAME);
@ -170,7 +175,7 @@ public class RecalibrationReport {
* @param reportTable the GATKReport table containing data for this table * @param reportTable the GATKReport table containing data for this table
* @return a lookup table indexed by bitsets containing the empirical quality and estimated quality reported for every key. * @return a lookup table indexed by bitsets containing the empirical quality and estimated quality reported for every key.
*/ */
private Map<BitSet, RecalDatum> parseReadGroupTable(BQSRKeyManager keyManager, GATKReportTable reportTable) { private Map<Long, RecalDatum> parseReadGroupTable(BQSRKeyManager keyManager, GATKReportTable reportTable) {
ArrayList<String> columnNamesOrderedList = new ArrayList<String>(2); ArrayList<String> columnNamesOrderedList = new ArrayList<String>(2);
columnNamesOrderedList.add(RecalDataManager.READGROUP_COLUMN_NAME); columnNamesOrderedList.add(RecalDataManager.READGROUP_COLUMN_NAME);
columnNamesOrderedList.add(RecalDataManager.EVENT_TYPE_COLUMN_NAME); columnNamesOrderedList.add(RecalDataManager.EVENT_TYPE_COLUMN_NAME);
@ -185,26 +190,26 @@ public class RecalibrationReport {
* @param columnNamesOrderedList a list of columns to read from the report table and build as key for this particular table * @param columnNamesOrderedList a list of columns to read from the report table and build as key for this particular table
* @return a lookup table indexed by bitsets containing the empirical quality and estimated quality reported for every key. * @return a lookup table indexed by bitsets containing the empirical quality and estimated quality reported for every key.
*/ */
private Map<BitSet, RecalDatum> genericRecalTableParsing(BQSRKeyManager keyManager, GATKReportTable reportTable, ArrayList<String> columnNamesOrderedList, boolean hasEstimatedQReportedColumn) { private Map<Long, RecalDatum> genericRecalTableParsing(BQSRKeyManager keyManager, GATKReportTable reportTable, ArrayList<String> columnNamesOrderedList, boolean hasEstimatedQReportedColumn) {
Map<BitSet, RecalDatum> result = new HashMap<BitSet, RecalDatum>(reportTable.getNumRows()*2); final Map<Long, RecalDatum> result = new HashMap<Long, RecalDatum>(reportTable.getNumRows()*2);
for ( int i = 0; i < reportTable.getNumRows(); i++ ) { for ( int i = 0; i < reportTable.getNumRows(); i++ ) {
int nKeys = columnNamesOrderedList.size(); final int nKeys = columnNamesOrderedList.size();
Object [] keySet = new Object[nKeys]; final Object [] keySet = new Object[nKeys];
for (int j = 0; j < nKeys; j++) for (int j = 0; j < nKeys; j++)
keySet[j] = reportTable.get(i, columnNamesOrderedList.get(j)); // all these objects are okay in String format, the key manager will handle them correctly (except for the event type (see below) keySet[j] = reportTable.get(i, columnNamesOrderedList.get(j)); // all these objects are okay in String format, the key manager will handle them correctly (except for the event type (see below)
keySet[keySet.length-1] = EventType.eventFrom((String) keySet[keySet.length-1]); // the last key is always the event type. We convert the string ("M", "I" or "D") to an enum object (necessary for the key manager). keySet[keySet.length-1] = EventType.eventFrom((String) keySet[keySet.length-1]); // the last key is always the event type. We convert the string ("M", "I" or "D") to an enum object (necessary for the key manager).
BitSet bitKey = keyManager.bitSetFromKey(keySet); final long bitKey = keyManager.longFromKey(keySet);
long nObservations = (Long) reportTable.get(i, RecalDataManager.NUMBER_OBSERVATIONS_COLUMN_NAME); final long nObservations = (Long) reportTable.get(i, RecalDataManager.NUMBER_OBSERVATIONS_COLUMN_NAME);
long nErrors = (Long) reportTable.get(i, RecalDataManager.NUMBER_ERRORS_COLUMN_NAME); final long nErrors = (Long) reportTable.get(i, RecalDataManager.NUMBER_ERRORS_COLUMN_NAME);
double empiricalQuality = (Double) reportTable.get(i, RecalDataManager.EMPIRICAL_QUALITY_COLUMN_NAME); final double empiricalQuality = (Double) reportTable.get(i, RecalDataManager.EMPIRICAL_QUALITY_COLUMN_NAME);
double estimatedQReported = hasEstimatedQReportedColumn ? // the estimatedQreported column only exists in the ReadGroup table final double estimatedQReported = hasEstimatedQReportedColumn ? // the estimatedQreported column only exists in the ReadGroup table
(Double) reportTable.get(i, RecalDataManager.ESTIMATED_Q_REPORTED_COLUMN_NAME) : // we get it if we are in the read group table (Double) reportTable.get(i, RecalDataManager.ESTIMATED_Q_REPORTED_COLUMN_NAME) : // we get it if we are in the read group table
Byte.parseByte((String) reportTable.get(i, RecalDataManager.QUALITY_SCORE_COLUMN_NAME)); // or we use the reported quality if we are in any other table Byte.parseByte((String) reportTable.get(i, RecalDataManager.QUALITY_SCORE_COLUMN_NAME)); // or we use the reported quality if we are in any other table
RecalDatum recalDatum = new RecalDatum(nObservations, nErrors, estimatedQReported, empiricalQuality); final RecalDatum recalDatum = new RecalDatum(nObservations, nErrors, estimatedQReported, empiricalQuality);
result.put(bitKey, recalDatum); result.put(bitKey, recalDatum);
} }
return result; return result;
@ -217,14 +222,14 @@ public class RecalibrationReport {
* @return an ArrayList with the quantization mappings from 0 to MAX_QUAL_SCORE * @return an ArrayList with the quantization mappings from 0 to MAX_QUAL_SCORE
*/ */
private QuantizationInfo initializeQuantizationTable(GATKReportTable table) { private QuantizationInfo initializeQuantizationTable(GATKReportTable table) {
Byte[] quals = new Byte[QualityUtils.MAX_QUAL_SCORE + 1]; final Byte[] quals = new Byte[QualityUtils.MAX_QUAL_SCORE + 1];
Long[] counts = new Long[QualityUtils.MAX_QUAL_SCORE + 1]; final Long[] counts = new Long[QualityUtils.MAX_QUAL_SCORE + 1];
for ( int i = 0; i < table.getNumRows(); i++ ) { for ( int i = 0; i < table.getNumRows(); i++ ) {
byte originalQual = (byte)i; final byte originalQual = (byte)i;
Object quantizedObject = table.get(i, RecalDataManager.QUANTIZED_VALUE_COLUMN_NAME); final Object quantizedObject = table.get(i, RecalDataManager.QUANTIZED_VALUE_COLUMN_NAME);
Object countObject = table.get(i, RecalDataManager.QUANTIZED_COUNT_COLUMN_NAME); final Object countObject = table.get(i, RecalDataManager.QUANTIZED_COUNT_COLUMN_NAME);
byte quantizedQual = Byte.parseByte(quantizedObject.toString()); final byte quantizedQual = Byte.parseByte(quantizedObject.toString());
long quantizedCount = Long.parseLong(countObject.toString()); final long quantizedCount = Long.parseLong(countObject.toString());
quals[originalQual] = quantizedQual; quals[originalQual] = quantizedQual;
counts[originalQual] = quantizedCount; counts[originalQual] = quantizedCount;
} }
@ -238,7 +243,7 @@ public class RecalibrationReport {
* @return a RAC object properly initialized with all the objects in the table * @return a RAC object properly initialized with all the objects in the table
*/ */
private RecalibrationArgumentCollection initializeArgumentCollectionTable(GATKReportTable table) { private RecalibrationArgumentCollection initializeArgumentCollectionTable(GATKReportTable table) {
RecalibrationArgumentCollection RAC = new RecalibrationArgumentCollection(); final RecalibrationArgumentCollection RAC = new RecalibrationArgumentCollection();
for ( int i = 0; i < table.getNumRows(); i++ ) { for ( int i = 0; i < table.getNumRows(); i++ ) {
final String argument = table.get(i, "Argument").toString(); final String argument = table.get(i, "Argument").toString();
@ -261,11 +266,8 @@ public class RecalibrationReport {
else if (argument.equals("mismatches_context_size")) else if (argument.equals("mismatches_context_size"))
RAC.MISMATCHES_CONTEXT_SIZE = Integer.parseInt((String) value); RAC.MISMATCHES_CONTEXT_SIZE = Integer.parseInt((String) value);
else if (argument.equals("insertions_context_size")) else if (argument.equals("indels_context_size"))
RAC.INSERTIONS_CONTEXT_SIZE = Integer.parseInt((String) value); RAC.INDELS_CONTEXT_SIZE = Integer.parseInt((String) value);
else if (argument.equals("deletions_context_size"))
RAC.DELETIONS_CONTEXT_SIZE = Integer.parseInt((String) value);
else if (argument.equals("mismatches_default_quality")) else if (argument.equals("mismatches_default_quality"))
RAC.MISMATCHES_DEFAULT_QUALITY = Byte.parseByte((String) value); RAC.MISMATCHES_DEFAULT_QUALITY = Byte.parseByte((String) value);
@ -306,7 +308,7 @@ public class RecalibrationReport {
* and quantization of the quality scores during every call of combine(). Very useful for the BQSRGatherer. * and quantization of the quality scores during every call of combine(). Very useful for the BQSRGatherer.
*/ */
public void calculateEmpiricalAndQuantizedQualities() { public void calculateEmpiricalAndQuantizedQualities() {
for (Map<BitSet, RecalDatum> table : keysAndTablesMap.values()) for (Map<Long, RecalDatum> table : keysAndTablesMap.values())
for (RecalDatum datum : table.values()) for (RecalDatum datum : table.values())
datum.calcCombinedEmpiricalQuality(); datum.calcCombinedEmpiricalQuality();
@ -331,26 +333,26 @@ public class RecalibrationReport {
return isEqualTable(this.keysAndTablesMap, other.keysAndTablesMap); return isEqualTable(this.keysAndTablesMap, other.keysAndTablesMap);
} }
private boolean isEqualTable(LinkedHashMap<BQSRKeyManager, Map<BitSet, RecalDatum>> t1, LinkedHashMap<BQSRKeyManager, Map<BitSet, RecalDatum>> t2) { private boolean isEqualTable(LinkedHashMap<BQSRKeyManager, Map<Long, RecalDatum>> t1, LinkedHashMap<BQSRKeyManager, Map<Long, RecalDatum>> t2) {
if (t1.size() != t2.size()) if (t1.size() != t2.size())
return false; return false;
Iterator<Map.Entry<BQSRKeyManager, Map<BitSet, RecalDatum>>> t1Iterator = t1.entrySet().iterator(); final Iterator<Map.Entry<BQSRKeyManager, Map<Long, RecalDatum>>> t1Iterator = t1.entrySet().iterator();
Iterator<Map.Entry<BQSRKeyManager, Map<BitSet, RecalDatum>>> t2Iterator = t2.entrySet().iterator(); final Iterator<Map.Entry<BQSRKeyManager, Map<Long, RecalDatum>>> t2Iterator = t2.entrySet().iterator();
while (t1Iterator.hasNext() && t2Iterator.hasNext()) { while (t1Iterator.hasNext() && t2Iterator.hasNext()) {
Map.Entry<BQSRKeyManager, Map<BitSet, RecalDatum>> t1MapEntry = t1Iterator.next(); Map.Entry<BQSRKeyManager, Map<Long, RecalDatum>> t1MapEntry = t1Iterator.next();
Map.Entry<BQSRKeyManager, Map<BitSet, RecalDatum>> t2MapEntry = t2Iterator.next(); Map.Entry<BQSRKeyManager, Map<Long, RecalDatum>> t2MapEntry = t2Iterator.next();
if (!(t1MapEntry.getKey().equals(t2MapEntry.getKey()))) if (!(t1MapEntry.getKey().equals(t2MapEntry.getKey())))
return false; return false;
Map<BitSet, RecalDatum> table2 = t2MapEntry.getValue(); final Map<Long, RecalDatum> table2 = t2MapEntry.getValue();
for (Map.Entry<BitSet, RecalDatum> t1TableEntry : t1MapEntry.getValue().entrySet()) { for (Map.Entry<Long, RecalDatum> t1TableEntry : t1MapEntry.getValue().entrySet()) {
BitSet t1Key = t1TableEntry.getKey(); final Long t1Key = t1TableEntry.getKey();
if (!table2.containsKey(t1Key)) if (!table2.containsKey(t1Key))
return false; return false;
RecalDatum t1Datum = t1TableEntry.getValue(); final RecalDatum t1Datum = t1TableEntry.getValue();
if (!t1Datum.equals(table2.get(t1Key))) if (!t1Datum.equals(table2.get(t1Key)))
return false; return false;
} }

View File

@ -33,12 +33,10 @@ import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.*; import org.broadinstitute.sting.gatk.walkers.*;
import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.SampleUtils; import org.broadinstitute.sting.utils.SampleUtils;
import org.broadinstitute.sting.utils.codecs.vcf.*; import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants;
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader;
import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.variantcontext.Allele; import org.broadinstitute.sting.utils.variantcontext.*;
import org.broadinstitute.sting.utils.variantcontext.Genotype;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import org.broadinstitute.sting.utils.variantcontext.VariantContextBuilder;
import org.broadinstitute.sting.utils.variantcontext.writer.VariantContextWriter; import org.broadinstitute.sting.utils.variantcontext.writer.VariantContextWriter;
import java.util.*; import java.util.*;
@ -147,7 +145,7 @@ public class DiagnoseTargets extends LocusWalker<Long, Long> {
intervalListIterator = new PeekableIterator<GenomeLoc>(getToolkit().getIntervals().iterator()); intervalListIterator = new PeekableIterator<GenomeLoc>(getToolkit().getIntervals().iterator());
samples = SampleUtils.getSAMFileSamples(getToolkit().getSAMFileHeader()); // get all of the unique sample names for the VCF Header samples = SampleUtils.getSAMFileSamples(getToolkit().getSAMFileHeader()); // get all of the unique sample names for the VCF Header
vcfWriter.writeHeader(new VCFHeader(getHeaderInfo(), samples)); // initialize the VCF header vcfWriter.writeHeader(new VCFHeader(ThresHolder.getHeaderInfo(), samples)); // initialize the VCF header
} }
@Override @Override
@ -249,6 +247,7 @@ public class DiagnoseTargets extends LocusWalker<Long, Long> {
private void outputStatsToVCF(IntervalStatistics stats, Allele refAllele) { private void outputStatsToVCF(IntervalStatistics stats, Allele refAllele) {
GenomeLoc interval = stats.getInterval(); GenomeLoc interval = stats.getInterval();
List<Allele> alleles = new ArrayList<Allele>(); List<Allele> alleles = new ArrayList<Allele>();
Map<String, Object> attributes = new HashMap<String, Object>(); Map<String, Object> attributes = new HashMap<String, Object>();
ArrayList<Genotype> genotypes = new ArrayList<Genotype>(); ArrayList<Genotype> genotypes = new ArrayList<Genotype>();
@ -258,73 +257,46 @@ public class DiagnoseTargets extends LocusWalker<Long, Long> {
VariantContextBuilder vcb = new VariantContextBuilder("DiagnoseTargets", interval.getContig(), interval.getStart(), interval.getStart(), alleles); VariantContextBuilder vcb = new VariantContextBuilder("DiagnoseTargets", interval.getContig(), interval.getStart(), interval.getStart(), alleles);
vcb = vcb.log10PError(VariantContext.NO_LOG10_PERROR); // QUAL field makes no sense in our VCF vcb = vcb.log10PError(VariantContext.NO_LOG10_PERROR); // QUAL field makes no sense in our VCF
vcb.filters(statusesToStrings(stats.callableStatuses(thresholds))); vcb.filters(new HashSet<String>(statusesToStrings(stats.callableStatuses(thresholds))));
attributes.put(VCFConstants.END_KEY, interval.getStop()); attributes.put(VCFConstants.END_KEY, interval.getStop());
attributes.put(VCFConstants.DEPTH_KEY, stats.averageCoverage()); attributes.put(VCFConstants.DEPTH_KEY, stats.averageCoverage());
vcb = vcb.attributes(attributes); vcb = vcb.attributes(attributes);
for (String sample : samples) {
Map<String, Object> infos = new HashMap<String, Object>();
SampleStatistics sampleStat = stats.getSample(sample);
infos.put(VCFConstants.DEPTH_KEY, sampleStat.averageCoverage());
infos.put("Q1", sampleStat.getQuantileDepth(0.25));
infos.put("MED", sampleStat.getQuantileDepth(0.50));
infos.put("Q3", sampleStat.getQuantileDepth(0.75));
Set<String> filters = new HashSet<String>();
filters.addAll(statusesToStrings(stats.getSample(sample).getCallableStatuses(thresholds)));
genotypes.add(new Genotype(sample, null, VariantContext.NO_LOG10_PERROR, filters, infos, false));
}
vcb = vcb.genotypes(genotypes);
if (debug) { if (debug) {
System.out.printf("Output -- Interval: %s, Coverage: %.2f%n", stats.getInterval(), stats.averageCoverage()); System.out.printf("Output -- Interval: %s, Coverage: %.2f%n", stats.getInterval(), stats.averageCoverage());
} }
for (String sample : samples) {
final GenotypeBuilder gb = new GenotypeBuilder(sample);
SampleStatistics sampleStat = stats.getSample(sample);
gb.DP((int)sampleStat.averageCoverage());
gb.attribute("Q1", sampleStat.getQuantileDepth(0.25));
gb.attribute("MED", sampleStat.getQuantileDepth(0.50));
gb.attribute("Q3", sampleStat.getQuantileDepth(0.75));
if (debug) {
System.out.printf("Found %d bad mates out of %d reads %n", sampleStat.getnBadMates(), sampleStat.getnReads());
}
gb.filters(statusesToStrings(stats.getSample(sample).getCallableStatuses(thresholds)));
genotypes.add(gb.make());
}
vcb = vcb.genotypes(genotypes);
vcfWriter.add(vcb.make()); vcfWriter.add(vcb.make());
} }
/**
* Gets the header lines for the VCF writer
*
* @return A set of VCF header lines
*/
private static Set<VCFHeaderLine> getHeaderInfo() {
Set<VCFHeaderLine> headerLines = new HashSet<VCFHeaderLine>();
// INFO fields for overall data
headerLines.add(new VCFInfoHeaderLine(VCFConstants.END_KEY, 1, VCFHeaderLineType.Integer, "Stop position of the interval"));
headerLines.add(new VCFInfoHeaderLine(VCFConstants.DEPTH_KEY, 1, VCFHeaderLineType.Float, "Average depth across the interval. Sum of the depth in a lci divided by interval size."));
headerLines.add(new VCFInfoHeaderLine("Diagnose Targets", 0, VCFHeaderLineType.Flag, "DiagnoseTargets mode"));
// FORMAT fields for each genotype
// todo -- find the appropriate VCF constants
headerLines.add(new VCFFormatHeaderLine(VCFConstants.DEPTH_KEY, 1, VCFHeaderLineType.Float, "Average depth across the interval. Sum of the depth in a lci divided by interval size."));
headerLines.add(new VCFFormatHeaderLine("Q1", 1, VCFHeaderLineType.Float, "Lower Quartile of depth distribution."));
headerLines.add(new VCFFormatHeaderLine("MED", 1, VCFHeaderLineType.Float, "Median of depth distribution."));
headerLines.add(new VCFFormatHeaderLine("Q3", 1, VCFHeaderLineType.Float, "Upper Quartile of depth Distribution."));
// FILTER fields
for (CallableStatus stat : CallableStatus.values())
headerLines.add(new VCFFilterHeaderLine(stat.name(), stat.description));
return headerLines;
}
/** /**
* Function that process a set of statuses into strings * Function that process a set of statuses into strings
* *
* @param statuses the set of statuses to be converted * @param statuses the set of statuses to be converted
* @return a matching set of strings * @return a matching set of strings
*/ */
private Set<String> statusesToStrings(Set<CallableStatus> statuses) { private List<String> statusesToStrings(Set<CallableStatus> statuses) {
Set<String> output = new HashSet<String>(statuses.size()); List<String> output = new ArrayList<String>(statuses.size());
for (CallableStatus status : statuses) for (CallableStatus status : statuses)
output.add(status.name()); output.add(status.name());
@ -333,6 +305,6 @@ public class DiagnoseTargets extends LocusWalker<Long, Long> {
} }
private IntervalStatistics createIntervalStatistic(GenomeLoc interval) { private IntervalStatistics createIntervalStatistic(GenomeLoc interval) {
return new IntervalStatistics(samples, interval /*, minimumCoverage, maximumCoverage, minimumMappingQuality, minimumBaseQuality*/); return new IntervalStatistics(samples, interval);
} }
} }

View File

@ -0,0 +1,84 @@
/*
* Copyright (c) 2012, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.walkers.diagnostics.targets;
import org.broadinstitute.sting.commandline.Output;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.ActiveRegionExtension;
import org.broadinstitute.sting.gatk.walkers.ActiveRegionWalker;
import org.broadinstitute.sting.gatk.walkers.PartitionBy;
import org.broadinstitute.sting.gatk.walkers.PartitionType;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.activeregion.ActiveRegion;
import java.io.PrintStream;
@PartitionBy(PartitionType.CONTIG)
@ActiveRegionExtension(extension = 0, maxRegion = 50000)
public class FindCoveredIntervals extends ActiveRegionWalker<GenomeLoc, Long> {
    @Output(required = true)
    private PrintStream out;

    // Depth at which a locus is considered fully "active" on the linear probability scale below.
    private static final int COVERAGE_THRESHOLD = 20;

    /**
     * Scores a locus by its filtered coverage, on a linear probability scale:
     * 0.0 at depth 0, rising linearly to 1.0 at COVERAGE_THRESHOLD and capped there.
     *
     * @param tracker the meta-data tracker (unused)
     * @param ref     the reference context (unused)
     * @param context the alignment context whose base pileup is depth-filtered
     * @return activity probability in [0, 1]
     */
    @Override
    public double isActive(final RefMetaDataTracker tracker, final ReferenceContext ref, final AlignmentContext context) {
        final int depth = ThresHolder.DEFAULTS.getFilteredCoverage(context.getBasePileup());
        return Math.min((double) depth / COVERAGE_THRESHOLD, 1.0);
    }

    /**
     * Emits the region's location when it was deemed active, otherwise null
     * (null is filtered out in {@link #reduce}).
     */
    @Override
    public GenomeLoc map(final ActiveRegion activeRegion, final RefMetaDataTracker tracker) {
        return activeRegion.isActive ? activeRegion.getLocation() : null;
    }

    /** @return the initial interval count, zero. */
    @Override
    public Long reduceInit() {
        return 0L;
    }

    /**
     * Prints each covered interval and counts it.
     *
     * BUG FIX: the original returned {@code reduce++}, which evaluates to the
     * PRE-increment value (and only increments the local parameter copy), so the
     * accumulated count never advanced and the final tally was always 0.
     *
     * @param value  the covered interval from map(), or null if the region was inactive
     * @param reduce the running count of covered intervals
     * @return the updated count
     */
    @Override
    public Long reduce(final GenomeLoc value, final Long reduce) {
        if (value != null) {
            out.println(value.toString());
            return reduce + 1;  // was "return reduce++;" -- returned the old value, count stayed 0
        }
        return reduce;
    }

    /** Logs the total number of covered intervals found during the traversal. */
    @Override
    public void onTraversalDone(final Long reduce) {
        logger.info(String.format("Found %d intervals", reduce));
    }
}

View File

@ -79,14 +79,12 @@ class SampleStatistics {
* @return the callable statuses of the entire sample * @return the callable statuses of the entire sample
*/ */
public Set<CallableStatus> getCallableStatuses(ThresHolder thresholds) { public Set<CallableStatus> getCallableStatuses(ThresHolder thresholds) {
Set<CallableStatus> output = new HashSet<CallableStatus>();
// We check if reads are present to prevent div / 0 exceptions // We check if reads are present to prevent div / 0 exceptions
if (nReads == 0) { if (nReads == 0) {
output.add(CallableStatus.NO_READS); return Collections.singleton(CallableStatus.NO_READS);
return output;
} }
Set<CallableStatus> output = new HashSet<CallableStatus>();
Map<CallableStatus, Double> totals = new HashMap<CallableStatus, Double>(CallableStatus.values().length); Map<CallableStatus, Double> totals = new HashMap<CallableStatus, Double>(CallableStatus.values().length);
// initialize map // initialize map
@ -104,19 +102,19 @@ class SampleStatistics {
double intervalSize = interval.size(); double intervalSize = interval.size();
if ((nBadMates / nReads) > thresholds.getBadMateStatusThreshold()) if (((double) nBadMates / nReads) >= thresholds.getBadMateStatusThreshold())
output.add(CallableStatus.BAD_MATE); output.add(CallableStatus.BAD_MATE);
if ((totals.get(CallableStatus.COVERAGE_GAPS) / intervalSize) > thresholds.getCoverageStatusThreshold()) if ((totals.get(CallableStatus.COVERAGE_GAPS) / intervalSize) >= thresholds.getCoverageStatusThreshold())
output.add(CallableStatus.COVERAGE_GAPS); output.add(CallableStatus.COVERAGE_GAPS);
if ((totals.get(CallableStatus.LOW_COVERAGE) / intervalSize) > thresholds.getCoverageStatusThreshold()) if ((totals.get(CallableStatus.LOW_COVERAGE) / intervalSize) >= thresholds.getCoverageStatusThreshold())
output.add(CallableStatus.LOW_COVERAGE); output.add(CallableStatus.LOW_COVERAGE);
if ((totals.get(CallableStatus.EXCESSIVE_COVERAGE) / intervalSize) > thresholds.getExcessiveCoverageThreshold()) if ((totals.get(CallableStatus.EXCESSIVE_COVERAGE) / intervalSize) >= thresholds.getExcessiveCoverageThreshold())
output.add(CallableStatus.EXCESSIVE_COVERAGE); output.add(CallableStatus.EXCESSIVE_COVERAGE);
if ((totals.get(CallableStatus.POOR_QUALITY) / intervalSize) > thresholds.getQualityStatusThreshold()) if ((totals.get(CallableStatus.POOR_QUALITY) / intervalSize) >= thresholds.getQualityStatusThreshold())
output.add(CallableStatus.POOR_QUALITY); output.add(CallableStatus.POOR_QUALITY);
if (totals.get(CallableStatus.REF_N) > 0) if (totals.get(CallableStatus.REF_N) > 0)
@ -126,6 +124,7 @@ class SampleStatistics {
if (output.isEmpty()) { if (output.isEmpty()) {
output.add(CallableStatus.PASS); output.add(CallableStatus.PASS);
} }
return output; return output;
} }
@ -146,7 +145,7 @@ class SampleStatistics {
int locusIndex = locus.getStart() - interval.getStart(); int locusIndex = locus.getStart() - interval.getStart();
int rawCoverage = pileup.depthOfCoverage(); int rawCoverage = pileup.depthOfCoverage();
int coverage = pileup.getBaseAndMappingFilteredPileup(thresholds.getMinimumBaseQuality(), thresholds.getMinimumMappingQuality()).depthOfCoverage(); int coverage = thresholds.getFilteredCoverage(pileup);
LocusStatistics locusData = new LocusStatistics(coverage, rawCoverage); LocusStatistics locusData = new LocusStatistics(coverage, rawCoverage);
@ -161,7 +160,7 @@ class SampleStatistics {
// Was this read already processed? // Was this read already processed?
if (read.getTemporaryAttribute("checkedBadMate") == null) { if (read.getTemporaryAttribute("checkedBadMate") == null) {
nReads++; nReads++;
if (hasValidMate(read, thresholds)) if (!hasValidMate(read, thresholds))
nBadMates++; nBadMates++;
read.setTemporaryAttribute("checkedBadMate", true); read.setTemporaryAttribute("checkedBadMate", true);
} }
@ -254,7 +253,7 @@ class SampleStatistics {
* reasonable insert size? * reasonable insert size?
* inverted? * inverted?
* same orientation? * same orientation?
* todo - same contig? * same contig?
* is pair mapped? * is pair mapped?
* todo - is forced mate? * todo - is forced mate?
* *
@ -264,6 +263,10 @@ class SampleStatistics {
if (!read.getReadPairedFlag()) if (!read.getReadPairedFlag())
return false; return false;
// different contigs
if (read.getMateReferenceIndex() != read.getReferenceIndex())
return false;
// unmapped // unmapped
if (read.getMateUnmappedFlag() || read.getReadUnmappedFlag()) if (read.getMateUnmappedFlag() || read.getReadUnmappedFlag())
return false; return false;
@ -277,10 +280,19 @@ class SampleStatistics {
read.getAlignmentStart() < read.getMateAlignmentStart()) read.getAlignmentStart() < read.getMateAlignmentStart())
return false; return false;
// TODO note: IGV uses a different algorithm for insert size, there should be a common util class that does this for you
// mates are too far apart // mates are too far apart
if (Math.abs(read.getAlignmentStart() - read.getMateAlignmentStart()) > thresholds.getMaximumInsertSize()) if (Math.abs(read.getAlignmentStart() - read.getMateAlignmentStart()) > thresholds.getMaximumInsertSize())
return false; return false;
return true; return true;
} }
public int getnReads() {
return nReads;
}
public int getnBadMates() {
return nBadMates;
}
} }

View File

@ -24,6 +24,12 @@
package org.broadinstitute.sting.gatk.walkers.diagnostics.targets; package org.broadinstitute.sting.gatk.walkers.diagnostics.targets;
import org.broadinstitute.sting.utils.codecs.vcf.*;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
import java.util.HashSet;
import java.util.Set;
class ThresHolder { class ThresHolder {
public static final ThresHolder DEFAULTS = new ThresHolder(20, 20, 5, 700, 20, 50, 0.5, 0.2, 0.5, 0.2, 0.2, 0.5); public static final ThresHolder DEFAULTS = new ThresHolder(20, 20, 5, 700, 20, 50, 0.5, 0.2, 0.5, 0.2, 0.2, 0.5);
@ -69,14 +75,6 @@ class ThresHolder {
this.qualityStatusThreshold = qualityStatusThreshold; this.qualityStatusThreshold = qualityStatusThreshold;
} }
public int getMinimumBaseQuality() {
return minimumBaseQuality;
}
public int getMinimumMappingQuality() {
return minimumMappingQuality;
}
public int getMinimumCoverage() { public int getMinimumCoverage() {
return minimumCoverage; return minimumCoverage;
} }
@ -116,4 +114,37 @@ class ThresHolder {
public double getQualityStatusThreshold() { public double getQualityStatusThreshold() {
return qualityStatusThreshold; return qualityStatusThreshold;
} }
public int getFilteredCoverage(ReadBackedPileup pileup) {
return pileup.getBaseAndMappingFilteredPileup(minimumBaseQuality, minimumMappingQuality).depthOfCoverage();
}
/**
* Gets the header lines for the VCF writer
*
* @return A set of VCF header lines
*/
public static Set<VCFHeaderLine> getHeaderInfo() {
Set<VCFHeaderLine> headerLines = new HashSet<VCFHeaderLine>();
// INFO fields for overall data
headerLines.add(new VCFInfoHeaderLine(VCFConstants.END_KEY, 1, VCFHeaderLineType.Integer, "Stop position of the interval"));
headerLines.add(new VCFInfoHeaderLine(VCFConstants.DEPTH_KEY, 1, VCFHeaderLineType.Float, "Average depth across the interval. Sum of the depth in a loci divided by interval size."));
headerLines.add(new VCFInfoHeaderLine("Diagnose Targets", 0, VCFHeaderLineType.Flag, "DiagnoseTargets mode"));
// FORMAT fields for each genotype
// todo -- find the appropriate VCF constants
headerLines.add(new VCFFormatHeaderLine(VCFConstants.DEPTH_KEY, 1, VCFHeaderLineType.Float, "Average depth across the interval. Sum of the depth in a loci divided by interval size."));
headerLines.add(new VCFFormatHeaderLine("Q1", 1, VCFHeaderLineType.Float, "Lower Quartile of depth distribution."));
headerLines.add(new VCFFormatHeaderLine("MED", 1, VCFHeaderLineType.Float, "Median of depth distribution."));
headerLines.add(new VCFFormatHeaderLine("Q3", 1, VCFHeaderLineType.Float, "Upper Quartile of depth Distribution."));
// FILTER fields
for (CallableStatus stat : CallableStatus.values())
headerLines.add(new VCFFilterHeaderLine(stat.name(), stat.description));
return headerLines;
}
} }

View File

@ -55,8 +55,6 @@ public class BAMDiffableReader implements DiffableReader {
int count = 0; int count = 0;
while ( iterator.hasNext() ) { while ( iterator.hasNext() ) {
if ( count++ > maxElementsToRead && maxElementsToRead != -1)
break;
final SAMRecord record = iterator.next(); final SAMRecord record = iterator.next();
// name is the read name + first of pair // name is the read name + first of pair
@ -88,6 +86,9 @@ public class BAMDiffableReader implements DiffableReader {
if ( ! root.hasElement(name) ) if ( ! root.hasElement(name) )
// protect ourselves from malformed files // protect ourselves from malformed files
root.add(readRoot); root.add(readRoot);
count += readRoot.size();
if ( count > maxElementsToRead && maxElementsToRead != -1)
break;
} }
reader.close(); reader.close();

View File

@ -147,7 +147,7 @@ public class DiffEngine {
* @param diffs the list of differences to summarize * @param diffs the list of differences to summarize
*/ */
public void reportSummarizedDifferences(List<Difference> diffs, SummaryReportParams params ) { public void reportSummarizedDifferences(List<Difference> diffs, SummaryReportParams params ) {
printSummaryReport(summarizedDifferencesOfPaths(diffs, params.maxRawDiffsToSummarize), params ); printSummaryReport(summarizedDifferencesOfPaths(diffs, params.doPairwise, params.maxRawDiffsToSummarize), params );
} }
final protected static String[] diffNameToPath(String diffName) { final protected static String[] diffNameToPath(String diffName) {
@ -161,9 +161,17 @@ public class DiffEngine {
diffs.add(new Difference(diff)); diffs.add(new Difference(diff));
} }
return summarizedDifferencesOfPaths(diffs, -1); return summarizedDifferencesOfPaths(diffs, true, -1);
} }
/**
* Computes a minimum set of potential differences between all singleton differences
* in singletonDiffs. Employs an expensive pairwise O(n^2) algorithm.
*
* @param singletonDiffs
* @param maxRawDiffsToSummarize
* @return
*/
private Map<String, Difference> initialPairwiseSummaries(final List<? extends Difference> singletonDiffs, private Map<String, Difference> initialPairwiseSummaries(final List<? extends Difference> singletonDiffs,
final int maxRawDiffsToSummarize) { final int maxRawDiffsToSummarize) {
Map<String, Difference> summaries = new HashMap<String, Difference>(); Map<String, Difference> summaries = new HashMap<String, Difference>();
@ -191,9 +199,41 @@ public class DiffEngine {
return summaries; return summaries;
} }
/**
* Computes the possible leaf differences among the singleton diffs.
*
* The leaf differences are all of the form *.*...*.X where all internal
* differences are wildcards and the only summarized difference considered
* interesting to compute is
*
* @param singletonDiffs
* @param maxRawDiffsToSummarize
* @return
*/
private Map<String, Difference> initialLeafSummaries(final List<? extends Difference> singletonDiffs,
final int maxRawDiffsToSummarize) {
Map<String, Difference> summaries = new HashMap<String, Difference>();
// create the initial set of differences
for ( final Difference d : singletonDiffs ) {
final String path = summarizedPath(d.getParts(), 1);
Difference sumDiff = new Difference(path, d.getMaster(), d.getTest());
sumDiff.setCount(0);
addSummaryIfMissing(summaries, sumDiff);
if ( maxRawDiffsToSummarize != -1 && summaries.size() > maxRawDiffsToSummarize)
return summaries;
}
return summaries;
}
protected List<Difference> summarizedDifferencesOfPaths(final List<? extends Difference> singletonDiffs, protected List<Difference> summarizedDifferencesOfPaths(final List<? extends Difference> singletonDiffs,
final boolean doPairwise,
final int maxRawDiffsToSummarize) { final int maxRawDiffsToSummarize) {
Map<String, Difference> summaries = initialPairwiseSummaries(singletonDiffs, maxRawDiffsToSummarize); final Map<String, Difference> summaries = doPairwise
? initialPairwiseSummaries(singletonDiffs, maxRawDiffsToSummarize)
: initialLeafSummaries(singletonDiffs, maxRawDiffsToSummarize);
// count differences // count differences
for ( Difference diffPath : singletonDiffs ) { for ( Difference diffPath : singletonDiffs ) {
@ -372,18 +412,21 @@ public class DiffEngine {
final int maxCountOneItems; final int maxCountOneItems;
final int minSumDiffToShow; final int minSumDiffToShow;
final int maxRawDiffsToSummarize; final int maxRawDiffsToSummarize;
final boolean doPairwise;
boolean descending = true; boolean descending = true;
public SummaryReportParams(PrintStream out, public SummaryReportParams(PrintStream out,
int maxItemsToDisplay, int maxItemsToDisplay,
int maxCountOneItems, int maxCountOneItems,
int minSumDiffToShow, int minSumDiffToShow,
int maxRawDiffsToSummarize) { int maxRawDiffsToSummarize,
final boolean doPairwise) {
this.out = out; this.out = out;
this.maxItemsToDisplay = maxItemsToDisplay; this.maxItemsToDisplay = maxItemsToDisplay;
this.maxCountOneItems = maxCountOneItems; this.maxCountOneItems = maxCountOneItems;
this.minSumDiffToShow = minSumDiffToShow; this.minSumDiffToShow = minSumDiffToShow;
this.maxRawDiffsToSummarize = maxRawDiffsToSummarize; this.maxRawDiffsToSummarize = maxRawDiffsToSummarize;
this.doPairwise = doPairwise;
} }
public void setDescending(boolean descending) { public void setDescending(boolean descending) {

View File

@ -111,21 +111,21 @@ import java.util.List;
* <p> * <p>
* *
* <pre> * <pre>
[testng] path count [testng] path count
[testng] *.*.*.AC 6 [testng] *.*.*.AC 6
[testng] *.*.*.AF 6 [testng] *.*.*.AF 6
[testng] *.*.*.AN 6 [testng] *.*.*.AN 6
[testng] 64b991fd3850f83614518f7d71f0532f.integrationtest.20:10000000.AC 1 [testng] 64b991fd3850f83614518f7d71f0532f.integrationtest.20:10000000.AC 1
[testng] 64b991fd3850f83614518f7d71f0532f.integrationtest.20:10000000.AF 1 [testng] 64b991fd3850f83614518f7d71f0532f.integrationtest.20:10000000.AF 1
[testng] 64b991fd3850f83614518f7d71f0532f.integrationtest.20:10000000.AN 1 [testng] 64b991fd3850f83614518f7d71f0532f.integrationtest.20:10000000.AN 1
[testng] 64b991fd3850f83614518f7d71f0532f.integrationtest.20:10000117.AC 1 [testng] 64b991fd3850f83614518f7d71f0532f.integrationtest.20:10000117.AC 1
[testng] 64b991fd3850f83614518f7d71f0532f.integrationtest.20:10000117.AF 1 [testng] 64b991fd3850f83614518f7d71f0532f.integrationtest.20:10000117.AF 1
[testng] 64b991fd3850f83614518f7d71f0532f.integrationtest.20:10000117.AN 1 [testng] 64b991fd3850f83614518f7d71f0532f.integrationtest.20:10000117.AN 1
[testng] 64b991fd3850f83614518f7d71f0532f.integrationtest.20:10000211.AC 1 [testng] 64b991fd3850f83614518f7d71f0532f.integrationtest.20:10000211.AC 1
[testng] 64b991fd3850f83614518f7d71f0532f.integrationtest.20:10000211.AF 1 [testng] 64b991fd3850f83614518f7d71f0532f.integrationtest.20:10000211.AF 1
[testng] 64b991fd3850f83614518f7d71f0532f.integrationtest.20:10000211.AN 1 [testng] 64b991fd3850f83614518f7d71f0532f.integrationtest.20:10000211.AN 1
[testng] 64b991fd3850f83614518f7d71f0532f.integrationtest.20:10000598.AC 1 [testng] 64b991fd3850f83614518f7d71f0532f.integrationtest.20:10000598.AC 1
</pre> </pre>
* *
* @author Mark DePristo * @author Mark DePristo
* @since 7/4/11 * @since 7/4/11
@ -165,6 +165,8 @@ public class DiffObjectsWalker extends RodWalker<Integer, Integer> {
@Argument(fullName="maxRawDiffsToSummary", shortName="maxRawDiffsToSummary", doc="Max. number of objects to read from the files. -1 [default] means unlimited", required=false) @Argument(fullName="maxRawDiffsToSummary", shortName="maxRawDiffsToSummary", doc="Max. number of objects to read from the files. -1 [default] means unlimited", required=false)
int maxRawDiffsToSummary = -1; int maxRawDiffsToSummary = -1;
@Argument(fullName="doPairwise", shortName="doPairwise", doc="If provided, we will compute the minimum pairwise differences to summary, which can be extremely expensive", required=false)
boolean doPairwise = false;
/** /**
* The max number of differences to display when summarizing. For example, if there are 10M differences, but * The max number of differences to display when summarizing. For example, if there are 10M differences, but
@ -199,11 +201,14 @@ public class DiffObjectsWalker extends RodWalker<Integer, Integer> {
@Argument(fullName="showItemizedDifferences", shortName="SID", doc="Should we enumerate all differences between the files?", required=false) @Argument(fullName="showItemizedDifferences", shortName="SID", doc="Should we enumerate all differences between the files?", required=false)
boolean showItemizedDifferences = false; boolean showItemizedDifferences = false;
@Argument(fullName="iterations", doc="Number of iterations to perform, should be 1 unless you are doing memory testing", required=false)
int iterations = 1;
DiffEngine diffEngine; DiffEngine diffEngine;
@Override @Override
public void initialize() { public void initialize() {
this.diffEngine = new DiffEngine(); this.diffEngine = new DiffEngine();
} }
@Override @Override
@ -223,29 +228,39 @@ public class DiffObjectsWalker extends RodWalker<Integer, Integer> {
@Override @Override
public void onTraversalDone(Integer sum) { public void onTraversalDone(Integer sum) {
//out.printf("Reading master file %s%n", masterFile); if ( iterations > 1 ) {
DiffElement master = diffEngine.createDiffableFromFile(masterFile, MAX_OBJECTS_TO_READ); for ( int i = 0; i < iterations; i++ ) {
logger.info(String.format("Read %d objects", master.size())); DiffEngine.SummaryReportParams params = new DiffEngine.SummaryReportParams(out, 20, 10, 0, -1, false);
//out.printf("Reading test file %s%n", testFile); boolean success = DiffEngine.simpleDiffFiles(masterFile, testFile, MAX_OBJECTS_TO_READ, params);
DiffElement test = diffEngine.createDiffableFromFile(testFile, MAX_OBJECTS_TO_READ); logger.info("Iteration " + i + " success " + success);
logger.info(String.format("Read %d objects", test.size())); }
} else {
//out.printf("Reading master file %s%n", masterFile);
DiffElement master = diffEngine.createDiffableFromFile(masterFile, MAX_OBJECTS_TO_READ);
logger.info(String.format("Read %d objects", master.size()));
//out.printf("Reading test file %s%n", testFile);
DiffElement test = diffEngine.createDiffableFromFile(testFile, MAX_OBJECTS_TO_READ);
logger.info(String.format("Read %d objects", test.size()));
// out.printf("Master diff objects%n"); // out.printf("Master diff objects%n");
// out.println(master.toString()); // out.println(master.toString());
// out.printf("Test diff objects%n"); // out.printf("Test diff objects%n");
// out.println(test.toString()); // out.println(test.toString());
List<Difference> diffs = diffEngine.diff(master, test); List<Difference> diffs = diffEngine.diff(master, test);
logger.info(String.format("Done computing diff with %d differences found", diffs.size())); logger.info(String.format("Done computing diff with %d differences found", diffs.size()));
if ( showItemizedDifferences ) { if ( showItemizedDifferences ) {
out.printf("Itemized results%n"); out.printf("Itemized results%n");
for ( Difference diff : diffs ) for ( Difference diff : diffs )
out.printf("DIFF: %s%n", diff.toString()); out.printf("DIFF: %s%n", diff.toString());
} }
DiffEngine.SummaryReportParams params = new DiffEngine.SummaryReportParams(out, MAX_DIFFS, MAX_COUNT1_DIFFS, minCountForDiff, maxRawDiffsToSummary); DiffEngine.SummaryReportParams params = new DiffEngine.SummaryReportParams(out,
params.setDescending(false); MAX_DIFFS, MAX_COUNT1_DIFFS, minCountForDiff,
diffEngine.reportSummarizedDifferences(diffs, params); maxRawDiffsToSummary, doPairwise);
logger.info(String.format("Done summarizing differences")); params.setDescending(false);
diffEngine.reportSummarizedDifferences(diffs, params);
logger.info(String.format("Done summarizing differences"));
}
} }
} }

View File

@ -29,11 +29,13 @@ import org.broad.tribble.AbstractFeatureReader;
import org.broad.tribble.FeatureReader; import org.broad.tribble.FeatureReader;
import org.broad.tribble.readers.AsciiLineReader; import org.broad.tribble.readers.AsciiLineReader;
import org.broad.tribble.readers.LineReader; import org.broad.tribble.readers.LineReader;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.codecs.vcf.*; import org.broadinstitute.sting.utils.codecs.vcf.*;
import org.broadinstitute.sting.utils.variantcontext.Genotype; import org.broadinstitute.sting.utils.variantcontext.Genotype;
import org.broadinstitute.sting.utils.variantcontext.VariantContext; import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import java.io.*; import java.io.*;
import java.util.Arrays;
import java.util.Iterator; import java.util.Iterator;
import java.util.Map; import java.util.Map;
@ -79,9 +81,6 @@ public class VCFDiffableReader implements DiffableReader {
String prevName = ""; String prevName = "";
Iterator<VariantContext> it = reader.iterator(); Iterator<VariantContext> it = reader.iterator();
while ( it.hasNext() ) { while ( it.hasNext() ) {
if ( count++ > maxElementsToRead && maxElementsToRead != -1)
break;
VariantContext vc = it.next(); VariantContext vc = it.next();
String name = vc.getChr() + ":" + vc.getStart(); String name = vc.getChr() + ":" + vc.getStart();
if ( name.equals(prevName) ) { if ( name.equals(prevName) ) {
@ -109,9 +108,12 @@ public class VCFDiffableReader implements DiffableReader {
for (Genotype g : vc.getGenotypes() ) { for (Genotype g : vc.getGenotypes() ) {
DiffNode gRoot = DiffNode.empty(g.getSampleName(), vcRoot); DiffNode gRoot = DiffNode.empty(g.getSampleName(), vcRoot);
gRoot.add("GT", g.getGenotypeString()); gRoot.add("GT", g.getGenotypeString());
gRoot.add("GQ", g.hasLog10PError() ? g.getLog10PError() * -10 : VCFConstants.MISSING_VALUE_v4 ); if ( g.hasGQ() ) gRoot.add("GQ", g.getGQ() );
if ( g.hasDP() ) gRoot.add("DP", g.getDP() );
if ( g.hasAD() ) gRoot.add("AD", Utils.join(",", g.getAD()));
if ( g.hasPL() ) gRoot.add("PL", Utils.join(",", g.getPL()));
for (Map.Entry<String, Object> attribute : g.getAttributes().entrySet()) { for (Map.Entry<String, Object> attribute : g.getExtendedAttributes().entrySet()) {
if ( ! attribute.getKey().startsWith("_") ) if ( ! attribute.getKey().startsWith("_") )
gRoot.add(attribute.getKey(), attribute.getValue()); gRoot.add(attribute.getKey(), attribute.getValue());
} }
@ -120,6 +122,9 @@ public class VCFDiffableReader implements DiffableReader {
} }
root.add(vcRoot); root.add(vcRoot);
count += vcRoot.size();
if ( count > maxElementsToRead && maxElementsToRead != -1)
break;
} }
reader.close(); reader.close();

View File

@ -297,13 +297,14 @@ public class VariantFiltrationWalker extends RodWalker<Integer, Integer> {
// for each genotype, check filters then create a new object // for each genotype, check filters then create a new object
for ( final Genotype g : vc.getGenotypes() ) { for ( final Genotype g : vc.getGenotypes() ) {
if ( g.isCalled() ) { if ( g.isCalled() ) {
Set<String> filters = new LinkedHashSet<String>(g.getFilters()); List<String> filters = new ArrayList<String>(g.getFilters());
for ( VariantContextUtils.JexlVCMatchExp exp : genotypeFilterExps ) { for ( VariantContextUtils.JexlVCMatchExp exp : genotypeFilterExps ) {
if ( VariantContextUtils.match(vc, g, exp) ) if ( VariantContextUtils.match(vc, g, exp) )
filters.add(exp.name); filters.add(exp.name);
} }
genotypes.add(new Genotype(g.getSampleName(), g.getAlleles(), g.getLog10PError(), filters, g.getAttributes(), g.isPhased()));
genotypes.add(new GenotypeBuilder(g).filters(filters).make());
} else { } else {
genotypes.add(g); genotypes.add(g);
} }

View File

@ -98,11 +98,9 @@ public class ConsensusAlleleCounter {
for ( Map.Entry<String, AlignmentContext> sample : contexts.entrySet() ) { for ( Map.Entry<String, AlignmentContext> sample : contexts.entrySet() ) {
final AlignmentContext context = AlignmentContextUtils.stratify(sample.getValue(), contextType); final AlignmentContext context = AlignmentContextUtils.stratify(sample.getValue(), contextType);
if ( context.hasBasePileup() ) { final ReadBackedPileup indelPileup = context.getBasePileup();
final ReadBackedPileup indelPileup = context.getBasePileup(); insCount += indelPileup.getNumberOfInsertionsAfterThisElement();
insCount += indelPileup.getNumberOfInsertionsAfterThisElement(); delCount += indelPileup.getNumberOfDeletionsAfterThisElement();
delCount += indelPileup.getNumberOfDeletionsAfterThisElement();
}
} }
if ( insCount < minIndelCountForGenotyping && delCount < minIndelCountForGenotyping ) if ( insCount < minIndelCountForGenotyping && delCount < minIndelCountForGenotyping )
@ -112,9 +110,6 @@ public class ConsensusAlleleCounter {
// todo -- warning, can be duplicating expensive partition here // todo -- warning, can be duplicating expensive partition here
AlignmentContext context = AlignmentContextUtils.stratify(sample.getValue(), contextType); AlignmentContext context = AlignmentContextUtils.stratify(sample.getValue(), contextType);
if ( !context.hasBasePileup() )
continue;
final ReadBackedPileup indelPileup = context.getBasePileup(); final ReadBackedPileup indelPileup = context.getBasePileup();
final int nIndelReads = indelPileup.getNumberOfInsertionsAfterThisElement() + indelPileup.getNumberOfDeletionsAfterThisElement(); final int nIndelReads = indelPileup.getNumberOfInsertionsAfterThisElement() + indelPileup.getNumberOfDeletionsAfterThisElement();

View File

@ -89,7 +89,7 @@ public abstract class GenotypeLikelihoodsCalculationModel implements Cloneable {
* @param ref reference context * @param ref reference context
* @param contexts stratified alignment contexts * @param contexts stratified alignment contexts
* @param contextType stratified context type * @param contextType stratified context type
* @param alternateAllelesToUse the alternate allele to use, null if not set * @param allAllelesToUse the alternate allele to use, null if not set
* @param useBAQedPileup should we use the BAQed pileup or the raw one? * @param useBAQedPileup should we use the BAQed pileup or the raw one?
* @param locParser Genome Loc Parser * @param locParser Genome Loc Parser
* @return variant context where genotypes are no-called but with GLs * @return variant context where genotypes are no-called but with GLs
@ -98,7 +98,7 @@ public abstract class GenotypeLikelihoodsCalculationModel implements Cloneable {
final ReferenceContext ref, final ReferenceContext ref,
final Map<String, AlignmentContext> contexts, final Map<String, AlignmentContext> contexts,
final AlignmentContextUtils.ReadOrientation contextType, final AlignmentContextUtils.ReadOrientation contextType,
final List<Allele> alternateAllelesToUse, final List<Allele> allAllelesToUse,
final boolean useBAQedPileup, final boolean useBAQedPileup,
final GenomeLocParser locParser); final GenomeLocParser locParser);

View File

@ -35,8 +35,7 @@ import org.broadinstitute.sting.utils.BaseUtils;
import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.Haplotype; import org.broadinstitute.sting.utils.Haplotype;
import org.broadinstitute.sting.utils.codecs.vcf.AbstractVCFCodec; import org.broadinstitute.sting.utils.collections.Pair;
import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants;
import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.pileup.PileupElement;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
import org.broadinstitute.sting.utils.variantcontext.*; import org.broadinstitute.sting.utils.variantcontext.*;
@ -44,14 +43,13 @@ import org.broadinstitute.sting.utils.variantcontext.*;
import java.util.*; import java.util.*;
public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsCalculationModel { public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsCalculationModel {
private final int HAPLOTYPE_SIZE; private static final int HAPLOTYPE_SIZE = 80;
private final boolean getAlleleListFromVCF;
private boolean DEBUG = false; private boolean DEBUG = false;
private boolean ignoreSNPAllelesWhenGenotypingIndels = false; private boolean ignoreSNPAllelesWhenGenotypingIndels = false;
private PairHMMIndelErrorModel pairModel; private PairHMMIndelErrorModel pairModel;
private boolean allelesArePadded;
private static ThreadLocal<HashMap<PileupElement, LinkedHashMap<Allele, Double>>> indelLikelihoodMap = private static ThreadLocal<HashMap<PileupElement, LinkedHashMap<Allele, Double>>> indelLikelihoodMap =
new ThreadLocal<HashMap<PileupElement, LinkedHashMap<Allele, Double>>>() { new ThreadLocal<HashMap<PileupElement, LinkedHashMap<Allele, Double>>>() {
protected synchronized HashMap<PileupElement, LinkedHashMap<Allele, Double>> initialValue() { protected synchronized HashMap<PileupElement, LinkedHashMap<Allele, Double>> initialValue() {
@ -75,124 +73,56 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood
super(UAC, logger); super(UAC, logger);
pairModel = new PairHMMIndelErrorModel(UAC.INDEL_GAP_OPEN_PENALTY, UAC.INDEL_GAP_CONTINUATION_PENALTY, pairModel = new PairHMMIndelErrorModel(UAC.INDEL_GAP_OPEN_PENALTY, UAC.INDEL_GAP_CONTINUATION_PENALTY,
UAC.OUTPUT_DEBUG_INDEL_INFO, !UAC.DONT_DO_BANDED_INDEL_COMPUTATION); UAC.OUTPUT_DEBUG_INDEL_INFO, !UAC.DONT_DO_BANDED_INDEL_COMPUTATION);
getAlleleListFromVCF = UAC.GenotypingMode == GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES;
HAPLOTYPE_SIZE = UAC.INDEL_HAPLOTYPE_SIZE;
DEBUG = UAC.OUTPUT_DEBUG_INDEL_INFO; DEBUG = UAC.OUTPUT_DEBUG_INDEL_INFO;
haplotypeMap = new LinkedHashMap<Allele, Haplotype>(); haplotypeMap = new LinkedHashMap<Allele, Haplotype>();
ignoreSNPAllelesWhenGenotypingIndels = UAC.IGNORE_SNP_ALLELES; ignoreSNPAllelesWhenGenotypingIndels = UAC.IGNORE_SNP_ALLELES;
} }
protected List<Allele> computeConsensusAlleles(ReferenceContext ref, protected static List<Allele> computeConsensusAlleles(ReferenceContext ref,
Map<String, AlignmentContext> contexts, Map<String, AlignmentContext> contexts,
AlignmentContextUtils.ReadOrientation contextType, AlignmentContextUtils.ReadOrientation contextType,
GenomeLocParser locParser) { GenomeLocParser locParser, UnifiedArgumentCollection UAC) {
ConsensusAlleleCounter counter = new ConsensusAlleleCounter(locParser, true, UAC.MIN_INDEL_COUNT_FOR_GENOTYPING, UAC.MIN_INDEL_FRACTION_PER_SAMPLE); ConsensusAlleleCounter counter = new ConsensusAlleleCounter(locParser, true, UAC.MIN_INDEL_COUNT_FOR_GENOTYPING, UAC.MIN_INDEL_FRACTION_PER_SAMPLE);
return counter.computeConsensusAlleles(ref, contexts, contextType); return counter.computeConsensusAlleles(ref, contexts, contextType);
} }
private final static EnumSet<VariantContext.Type> allowableTypes = EnumSet.of(VariantContext.Type.INDEL, VariantContext.Type.MIXED); private final static EnumSet<VariantContext.Type> allowableTypes = EnumSet.of(VariantContext.Type.INDEL, VariantContext.Type.MIXED);
public VariantContext getLikelihoods(final RefMetaDataTracker tracker, public VariantContext getLikelihoods(final RefMetaDataTracker tracker,
final ReferenceContext ref, final ReferenceContext ref,
final Map<String, AlignmentContext> contexts, final Map<String, AlignmentContext> contexts,
final AlignmentContextUtils.ReadOrientation contextType, final AlignmentContextUtils.ReadOrientation contextType,
final List<Allele> alternateAllelesToUse, final List<Allele> allAllelesToUse,
final boolean useBAQedPileup, final boolean useBAQedPileup,
final GenomeLocParser locParser) { final GenomeLocParser locParser) {
if (tracker == null)
return null;
GenomeLoc loc = ref.getLocus(); GenomeLoc loc = ref.getLocus();
Allele refAllele, altAllele; // if (!ref.getLocus().equals(lastSiteVisited)) {
VariantContext vc = null; if (contextType == AlignmentContextUtils.ReadOrientation.COMPLETE) {
boolean allelesArePadded = true;
if (!ref.getLocus().equals(lastSiteVisited)) {
// starting a new site: clear allele list // starting a new site: clear allele list
alleleList.clear();
lastSiteVisited = ref.getLocus(); lastSiteVisited = ref.getLocus();
indelLikelihoodMap.set(new HashMap<PileupElement, LinkedHashMap<Allele, Double>>()); indelLikelihoodMap.set(new HashMap<PileupElement, LinkedHashMap<Allele, Double>>());
haplotypeMap.clear(); haplotypeMap.clear();
if (getAlleleListFromVCF) { Pair<List<Allele>,Boolean> pair = getInitialAlleleList(tracker, ref, contexts, contextType, locParser, UAC, ignoreSNPAllelesWhenGenotypingIndels);
for (final VariantContext vc_input : tracker.getValues(UAC.alleles, loc)) { alleleList = pair.first;
if (vc_input != null && allelesArePadded = pair.second;
allowableTypes.contains(vc_input.getType()) && if (alleleList.isEmpty())
ref.getLocus().getStart() == vc_input.getStart()) { return null;
vc = vc_input;
break;
}
}
// ignore places where we don't have a variant
if (vc == null)
return null;
alleleList.clear();
if (ignoreSNPAllelesWhenGenotypingIndels) {
// if there's an allele that has same length as the reference (i.e. a SNP or MNP), ignore it and don't genotype it
for (Allele a : vc.getAlleles())
if (a.isNonReference() && a.getBases().length == vc.getReference().getBases().length)
continue;
else
alleleList.add(a);
} else {
for (Allele a : vc.getAlleles())
alleleList.add(a);
}
if (vc.getReference().getBases().length == vc.getEnd()-vc.getStart()+1)
allelesArePadded = false;
} else {
alleleList = computeConsensusAlleles(ref, contexts, contextType, locParser);
if (alleleList.isEmpty())
return null;
}
}
// protect against having an indel too close to the edge of a contig
if (loc.getStart() <= HAPLOTYPE_SIZE)
return null;
// check if there is enough reference window to create haplotypes (can be an issue at end of contigs)
if (ref.getWindow().getStop() < loc.getStop() + HAPLOTYPE_SIZE)
return null;
if (alleleList.isEmpty())
return null;
refAllele = alleleList.get(0);
altAllele = alleleList.get(1);
// look for alt allele that has biggest length distance to ref allele
int maxLenDiff = 0;
for (Allele a : alleleList) {
if (a.isNonReference()) {
int lenDiff = Math.abs(a.getBaseString().length() - refAllele.getBaseString().length());
if (lenDiff > maxLenDiff) {
maxLenDiff = lenDiff;
altAllele = a;
}
}
} }
final int eventLength = altAllele.getBaseString().length() - refAllele.getBaseString().length();
final int hsize = ref.getWindow().size() - Math.abs(eventLength) - 1;
final int numPrefBases = ref.getLocus().getStart() - ref.getWindow().getStart() + 1;
if (hsize <= 0) { getHaplotypeMapFromAlleles(alleleList, ref, loc, haplotypeMap); // will update haplotypeMap adding elements
logger.warn(String.format("Warning: event at location %s can't be genotyped, skipping", loc.toString())); if (haplotypeMap == null || haplotypeMap.isEmpty())
return null; return null;
}
haplotypeMap = Haplotype.makeHaplotypeListFromAlleles(alleleList, loc.getStart(),
ref, hsize, numPrefBases);
// start making the VariantContext // start making the VariantContext
// For all non-snp VC types, VC end location is just startLocation + length of ref allele including padding base. // For all non-snp VC types, VC end location is just startLocation + length of ref allele including padding base.
int endLoc = loc.getStart() + refAllele.length()-1;
if (allelesArePadded) final int endLoc = computeEndLocation(alleleList, loc,allelesArePadded);
endLoc++; final int eventLength = getEventLength(alleleList);
final VariantContextBuilder builder = new VariantContextBuilder("UG_call", loc.getContig(), loc.getStart(), endLoc, alleleList).referenceBaseForIndel(ref.getBase()); final VariantContextBuilder builder = new VariantContextBuilder("UG_call", loc.getContig(), loc.getStart(), endLoc, alleleList).referenceBaseForIndel(ref.getBase());
// create the genotypes; no-call everyone for now // create the genotypes; no-call everyone for now
@ -206,23 +136,19 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood
for (Map.Entry<String, AlignmentContext> sample : contexts.entrySet()) { for (Map.Entry<String, AlignmentContext> sample : contexts.entrySet()) {
AlignmentContext context = AlignmentContextUtils.stratify(sample.getValue(), contextType); AlignmentContext context = AlignmentContextUtils.stratify(sample.getValue(), contextType);
if (context.hasBasePileup()) { final ReadBackedPileup pileup = context.getBasePileup();
final ReadBackedPileup pileup = context.getBasePileup(); if (pileup != null) {
if (pileup != null) { final GenotypeBuilder b = new GenotypeBuilder(sample.getKey());
final double[] genotypeLikelihoods = pairModel.computeReadHaplotypeLikelihoods(pileup, haplotypeMap, ref, eventLength, getIndelLikelihoodMap()); final double[] genotypeLikelihoods = pairModel.computeDiploidReadHaplotypeLikelihoods(pileup, haplotypeMap, ref, eventLength, getIndelLikelihoodMap());
GenotypeLikelihoods likelihoods = GenotypeLikelihoods.fromLog10Likelihoods(genotypeLikelihoods); b.PL(genotypeLikelihoods);
b.DP(getFilteredDepth(pileup));
genotypes.add(b.make());
HashMap<String, Object> attributes = new HashMap<String, Object>(); if (DEBUG) {
attributes.put(VCFConstants.DEPTH_KEY, getFilteredDepth(pileup)); System.out.format("Sample:%s Alleles:%s GL:", sample.getKey(), alleleList.toString());
attributes.put(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY, likelihoods); for (int k = 0; k < genotypeLikelihoods.length; k++)
genotypes.add(new Genotype(sample.getKey(), noCall, Genotype.NO_LOG10_PERROR, null, attributes, false)); System.out.format("%1.4f ", genotypeLikelihoods[k]);
System.out.println();
if (DEBUG) {
System.out.format("Sample:%s Alleles:%s GL:", sample.getKey(), alleleList.toString());
for (int k = 0; k < genotypeLikelihoods.length; k++)
System.out.format("%1.4f ", genotypeLikelihoods[k]);
System.out.println();
}
} }
} }
} }
@ -234,6 +160,102 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood
return indelLikelihoodMap.get(); return indelLikelihoodMap.get();
} }
public static int computeEndLocation(final List<Allele> alleles, final GenomeLoc loc, final boolean allelesArePadded) {
Allele refAllele = alleles.get(0);
int endLoc = loc.getStart() + refAllele.length()-1;
if (allelesArePadded)
endLoc++;
return endLoc;
}
public static void getHaplotypeMapFromAlleles(final List<Allele> alleleList,
final ReferenceContext ref,
final GenomeLoc loc,
final LinkedHashMap<Allele, Haplotype> haplotypeMap) {
// protect against having an indel too close to the edge of a contig
if (loc.getStart() <= HAPLOTYPE_SIZE)
haplotypeMap.clear();
// check if there is enough reference window to create haplotypes (can be an issue at end of contigs)
else if (ref.getWindow().getStop() < loc.getStop() + HAPLOTYPE_SIZE)
haplotypeMap.clear();
else if (alleleList.isEmpty())
haplotypeMap.clear();
else {
final int eventLength = getEventLength(alleleList);
final int hsize = ref.getWindow().size() - Math.abs(eventLength) - 1;
final int numPrefBases = ref.getLocus().getStart() - ref.getWindow().getStart() + 1;
haplotypeMap.putAll(Haplotype.makeHaplotypeListFromAlleles(alleleList, loc.getStart(),
ref, hsize, numPrefBases));
}
}
public static int getEventLength(List<Allele> alleleList) {
Allele refAllele = alleleList.get(0);
Allele altAllele = alleleList.get(1);
// look for alt allele that has biggest length distance to ref allele
int maxLenDiff = 0;
for (Allele a : alleleList) {
if (a.isNonReference()) {
int lenDiff = Math.abs(a.getBaseString().length() - refAllele.getBaseString().length());
if (lenDiff > maxLenDiff) {
maxLenDiff = lenDiff;
altAllele = a;
}
}
}
return altAllele.getBaseString().length() - refAllele.getBaseString().length();
}
public static Pair<List<Allele>,Boolean> getInitialAlleleList(final RefMetaDataTracker tracker,
final ReferenceContext ref,
final Map<String, AlignmentContext> contexts,
final AlignmentContextUtils.ReadOrientation contextType,
final GenomeLocParser locParser,
final UnifiedArgumentCollection UAC,
final boolean ignoreSNPAllelesWhenGenotypingIndels) {
List<Allele> alleles = new ArrayList<Allele>();
boolean allelesArePadded = true;
if (UAC.GenotypingMode == GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES) {
VariantContext vc = null;
for (final VariantContext vc_input : tracker.getValues(UAC.alleles, ref.getLocus())) {
if (vc_input != null &&
allowableTypes.contains(vc_input.getType()) &&
ref.getLocus().getStart() == vc_input.getStart()) {
vc = vc_input;
break;
}
}
// ignore places where we don't have a variant
if (vc == null)
return new Pair<List<Allele>,Boolean>(alleles,false);
if (ignoreSNPAllelesWhenGenotypingIndels) {
// if there's an allele that has same length as the reference (i.e. a SNP or MNP), ignore it and don't genotype it
for (Allele a : vc.getAlleles())
if (a.isNonReference() && a.getBases().length == vc.getReference().getBases().length)
continue;
else
alleles.add(a);
} else {
alleles.addAll(vc.getAlleles());
}
if ( vc.getReference().getBases().length == vc.getEnd()-vc.getStart()+1)
allelesArePadded = false;
} else {
alleles = IndelGenotypeLikelihoodsCalculationModel.computeConsensusAlleles(ref, contexts, contextType, locParser, UAC);
}
return new Pair<List<Allele>,Boolean> (alleles,allelesArePadded);
}
// Overload function in GenotypeLikelihoodsCalculationModel so that, for an indel case, we consider a deletion as part of the pileup, // Overload function in GenotypeLikelihoodsCalculationModel so that, for an indel case, we consider a deletion as part of the pileup,
// so that per-sample DP will include deletions covering the event. // so that per-sample DP will include deletions covering the event.
protected int getFilteredDepth(ReadBackedPileup pileup) { protected int getFilteredDepth(ReadBackedPileup pileup) {

View File

@ -62,7 +62,7 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC
final ReferenceContext ref, final ReferenceContext ref,
final Map<String, AlignmentContext> contexts, final Map<String, AlignmentContext> contexts,
final AlignmentContextUtils.ReadOrientation contextType, final AlignmentContextUtils.ReadOrientation contextType,
final List<Allele> alternateAllelesToUse, final List<Allele> allAllelesToUse,
final boolean useBAQedPileup, final boolean useBAQedPileup,
final GenomeLocParser locParser) { final GenomeLocParser locParser) {
@ -70,11 +70,6 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC
final int indexOfRefBase = BaseUtils.simpleBaseToBaseIndex(refBase); final int indexOfRefBase = BaseUtils.simpleBaseToBaseIndex(refBase);
final Allele refAllele = Allele.create(refBase, true); final Allele refAllele = Allele.create(refBase, true);
// start making the VariantContext
final GenomeLoc loc = ref.getLocus();
final List<Allele> alleles = new ArrayList<Allele>();
alleles.add(refAllele);
final VariantContextBuilder builder = new VariantContextBuilder("UG_call", loc.getContig(), loc.getStart(), loc.getStop(), alleles);
// calculate the GLs // calculate the GLs
ArrayList<SampleGenotypeData> GLs = new ArrayList<SampleGenotypeData>(contexts.size()); ArrayList<SampleGenotypeData> GLs = new ArrayList<SampleGenotypeData>(contexts.size());
@ -90,9 +85,16 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC
GLs.add(new SampleGenotypeData(sample.getKey(), GL, getFilteredDepth(pileup))); GLs.add(new SampleGenotypeData(sample.getKey(), GL, getFilteredDepth(pileup)));
} }
// start making the VariantContext
final GenomeLoc loc = ref.getLocus();
final List<Allele> alleles = new ArrayList<Allele>();
alleles.add(refAllele);
final VariantContextBuilder builder = new VariantContextBuilder("UG_call", loc.getContig(), loc.getStart(), loc.getStop(), alleles);
// find the alternate allele(s) that we should be using // find the alternate allele(s) that we should be using
if ( alternateAllelesToUse != null ) { if ( allAllelesToUse != null ) {
alleles.addAll(alternateAllelesToUse); alleles.addAll(allAllelesToUse.subList(1,allAllelesToUse.size())); // this includes ref allele
} else if ( useAlleleFromVCF ) { } else if ( useAlleleFromVCF ) {
final VariantContext vc = UnifiedGenotyperEngine.getVCFromAllelesRod(tracker, ref, ref.getLocus(), true, logger, UAC.alleles); final VariantContext vc = UnifiedGenotyperEngine.getVCFromAllelesRod(tracker, ref, ref.getLocus(), true, logger, UAC.alleles);
@ -156,12 +158,11 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC
myLikelihoods[i] = allLikelihoods[PLordering[i]]; myLikelihoods[i] = allLikelihoods[PLordering[i]];
// normalize in log space so that max element is zero. // normalize in log space so that max element is zero.
final GenotypeLikelihoods likelihoods = GenotypeLikelihoods.fromLog10Likelihoods(MathUtils.normalizeFromLog10(myLikelihoods, false, true)); final GenotypeBuilder gb = new GenotypeBuilder(sampleData.name);
final double[] genotypeLikelihoods = MathUtils.normalizeFromLog10(myLikelihoods, false, true);
final HashMap<String, Object> attributes = new HashMap<String, Object>(); gb.PL(genotypeLikelihoods);
attributes.put(VCFConstants.DEPTH_KEY, sampleData.depth); gb.DP(sampleData.depth);
attributes.put(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY, likelihoods); genotypes.add(gb.make());
genotypes.add(new Genotype(sampleData.name, noCall, Genotype.NO_LOG10_PERROR, null, attributes, false));
} }
return builder.genotypes(genotypes).make(); return builder.genotypes(genotypes).make();

View File

@ -65,18 +65,15 @@ public class UnifiedArgumentCollection {
/** /**
* The minimum phred-scaled Qscore threshold to separate high confidence from low confidence calls. Only genotypes with * The minimum phred-scaled Qscore threshold to separate high confidence from low confidence calls. Only genotypes with
* confidence >= this threshold are emitted as called sites. A reasonable threshold is 30 for high-pass calling (this * confidence >= this threshold are emitted as called sites. A reasonable threshold is 30 for high-pass calling (this
* is the default). Note that the confidence (QUAL) values for multi-sample low-pass (e.g. 4x per sample) calling might * is the default).
* be significantly smaller with the new EXACT model than with our older GRID_SEARCH model, as the latter tended to
* over-estimate the confidence; for low-pass calling we tend to use much smaller thresholds (e.g. 4).
*/ */
@Argument(fullName = "standard_min_confidence_threshold_for_calling", shortName = "stand_call_conf", doc = "The minimum phred-scaled confidence threshold at which variants not at 'trigger' track sites should be called", required = false) @Argument(fullName = "standard_min_confidence_threshold_for_calling", shortName = "stand_call_conf", doc = "The minimum phred-scaled confidence threshold at which variants should be called", required = false)
public double STANDARD_CONFIDENCE_FOR_CALLING = 30.0; public double STANDARD_CONFIDENCE_FOR_CALLING = 30.0;
/** /**
* the minimum phred-scaled Qscore threshold to emit low confidence calls. Genotypes with confidence >= this but less * This argument allows you to emit low quality calls as filtered records.
* than the calling threshold are emitted but marked as filtered.
*/ */
@Argument(fullName = "standard_min_confidence_threshold_for_emitting", shortName = "stand_emit_conf", doc = "The minimum phred-scaled confidence threshold at which variants not at 'trigger' track sites should be emitted (and filtered if less than the calling threshold)", required = false) @Argument(fullName = "standard_min_confidence_threshold_for_emitting", shortName = "stand_emit_conf", doc = "The minimum phred-scaled confidence threshold at which variants should be emitted (and filtered with LowQual if less than the calling threshold)", required = false)
public double STANDARD_CONFIDENCE_FOR_EMITTING = 30.0; public double STANDARD_CONFIDENCE_FOR_EMITTING = 30.0;
/** /**

View File

@ -252,7 +252,7 @@ public class UnifiedGenotyperEngine {
vc = new VariantContextBuilder("UG_call", ref.getLocus().getContig(), ref.getLocus().getStart(), ref.getLocus().getStart(), alleles).make(); vc = new VariantContextBuilder("UG_call", ref.getLocus().getContig(), ref.getLocus().getStart(), ref.getLocus().getStart(), alleles).make();
} }
if ( annotationEngine != null && rawContext.hasBasePileup() ) { if ( annotationEngine != null ) {
// Note: we want to use the *unfiltered* and *unBAQed* context for the annotations // Note: we want to use the *unfiltered* and *unBAQed* context for the annotations
final ReadBackedPileup pileup = rawContext.getBasePileup(); final ReadBackedPileup pileup = rawContext.getBasePileup();
stratifiedContexts = AlignmentContextUtils.splitContextBySampleName(pileup); stratifiedContexts = AlignmentContextUtils.splitContextBySampleName(pileup);
@ -378,10 +378,10 @@ public class UnifiedGenotyperEngine {
double overallLog10PofF = AFresult.getLog10PosteriorsMatrixSumWithoutAFzero(); double overallLog10PofF = AFresult.getLog10PosteriorsMatrixSumWithoutAFzero();
//if ( DEBUG_SLOD ) System.out.println("overallLog10PofF=" + overallLog10PofF); //if ( DEBUG_SLOD ) System.out.println("overallLog10PofF=" + overallLog10PofF);
List<Allele> alternateAllelesToUse = builder.make().getAlternateAlleles(); List<Allele> allAllelesToUse = builder.make().getAlleles();
// the forward lod // the forward lod
VariantContext vcForward = calculateLikelihoods(tracker, refContext, stratifiedContexts, AlignmentContextUtils.ReadOrientation.FORWARD, alternateAllelesToUse, false, model); VariantContext vcForward = calculateLikelihoods(tracker, refContext, stratifiedContexts, AlignmentContextUtils.ReadOrientation.FORWARD, allAllelesToUse, false, model);
AFresult.reset(); AFresult.reset();
afcm.get().getLog10PNonRef(vcForward, getAlleleFrequencyPriors(model), AFresult); afcm.get().getLog10PNonRef(vcForward, getAlleleFrequencyPriors(model), AFresult);
//double[] normalizedLog10Posteriors = MathUtils.normalizeFromLog10(AFresult.log10AlleleFrequencyPosteriors, true); //double[] normalizedLog10Posteriors = MathUtils.normalizeFromLog10(AFresult.log10AlleleFrequencyPosteriors, true);
@ -390,7 +390,7 @@ public class UnifiedGenotyperEngine {
//if ( DEBUG_SLOD ) System.out.println("forwardLog10PofNull=" + forwardLog10PofNull + ", forwardLog10PofF=" + forwardLog10PofF); //if ( DEBUG_SLOD ) System.out.println("forwardLog10PofNull=" + forwardLog10PofNull + ", forwardLog10PofF=" + forwardLog10PofF);
// the reverse lod // the reverse lod
VariantContext vcReverse = calculateLikelihoods(tracker, refContext, stratifiedContexts, AlignmentContextUtils.ReadOrientation.REVERSE, alternateAllelesToUse, false, model); VariantContext vcReverse = calculateLikelihoods(tracker, refContext, stratifiedContexts, AlignmentContextUtils.ReadOrientation.REVERSE, allAllelesToUse, false, model);
AFresult.reset(); AFresult.reset();
afcm.get().getLog10PNonRef(vcReverse, getAlleleFrequencyPriors(model), AFresult); afcm.get().getLog10PNonRef(vcReverse, getAlleleFrequencyPriors(model), AFresult);
//normalizedLog10Posteriors = MathUtils.normalizeFromLog10(AFresult.log10AlleleFrequencyPosteriors, true); //normalizedLog10Posteriors = MathUtils.normalizeFromLog10(AFresult.log10AlleleFrequencyPosteriors, true);
@ -422,7 +422,7 @@ public class UnifiedGenotyperEngine {
if ( myAlleles.size() != vc.getAlleles().size() && !limitedContext ) // TODO - this function doesn't work with mixed records or records that started as mixed and then became non-mixed if ( myAlleles.size() != vc.getAlleles().size() && !limitedContext ) // TODO - this function doesn't work with mixed records or records that started as mixed and then became non-mixed
vcCall = VariantContextUtils.reverseTrimAlleles(vcCall); vcCall = VariantContextUtils.reverseTrimAlleles(vcCall);
if ( annotationEngine != null && !limitedContext && rawContext.hasBasePileup() ) { if ( annotationEngine != null && !limitedContext ) {
// Note: we want to use the *unfiltered* and *unBAQed* context for the annotations // Note: we want to use the *unfiltered* and *unBAQed* context for the annotations
final ReadBackedPileup pileup = rawContext.getBasePileup(); final ReadBackedPileup pileup = rawContext.getBasePileup();
stratifiedContexts = AlignmentContextUtils.splitContextBySampleName(pileup); stratifiedContexts = AlignmentContextUtils.splitContextBySampleName(pileup);
@ -441,7 +441,7 @@ public class UnifiedGenotyperEngine {
private Map<String, AlignmentContext> getFilteredAndStratifiedContexts(UnifiedArgumentCollection UAC, ReferenceContext refContext, AlignmentContext rawContext, final GenotypeLikelihoodsCalculationModel.Model model) { private Map<String, AlignmentContext> getFilteredAndStratifiedContexts(UnifiedArgumentCollection UAC, ReferenceContext refContext, AlignmentContext rawContext, final GenotypeLikelihoodsCalculationModel.Model model) {
if ( !BaseUtils.isRegularBase(refContext.getBase()) || !rawContext.hasBasePileup() ) if ( !BaseUtils.isRegularBase(refContext.getBase()) )
return null; return null;
Map<String, AlignmentContext> stratifiedContexts = null; Map<String, AlignmentContext> stratifiedContexts = null;
@ -507,9 +507,7 @@ public class UnifiedGenotyperEngine {
int depth = 0; int depth = 0;
if ( isCovered ) { if ( isCovered ) {
AlignmentContext context = contexts.get(sample); depth = contexts.get(sample).getBasePileup().depthOfCoverage();
if ( context.hasBasePileup() )
depth = context.getBasePileup().depthOfCoverage();
} }
P_of_ref *= 1.0 - (theta / 2.0) * getRefBinomialProb(depth); P_of_ref *= 1.0 - (theta / 2.0) * getRefBinomialProb(depth);
@ -571,37 +569,35 @@ public class UnifiedGenotyperEngine {
final List<GenotypeLikelihoodsCalculationModel.Model> models = new ArrayList<GenotypeLikelihoodsCalculationModel.Model>(2); final List<GenotypeLikelihoodsCalculationModel.Model> models = new ArrayList<GenotypeLikelihoodsCalculationModel.Model>(2);
if ( rawContext.hasBasePileup() ) { // if we're genotyping given alleles and we have a requested SNP at this position, do SNP
// if we're genotyping given alleles and we have a requested SNP at this position, do SNP if ( UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ) {
if ( UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ) { final VariantContext vcInput = UnifiedGenotyperEngine.getVCFromAllelesRod(tracker, refContext, rawContext.getLocation(), false, logger, UAC.alleles);
final VariantContext vcInput = UnifiedGenotyperEngine.getVCFromAllelesRod(tracker, refContext, rawContext.getLocation(), false, logger, UAC.alleles); if ( vcInput == null )
if ( vcInput == null ) return models;
return models;
if ( vcInput.isSNP() ) { if ( vcInput.isSNP() ) {
// ignore SNPs if the user chose INDEL mode only // ignore SNPs if the user chose INDEL mode only
if ( UAC.GLmodel == GenotypeLikelihoodsCalculationModel.Model.BOTH ) if ( UAC.GLmodel == GenotypeLikelihoodsCalculationModel.Model.BOTH )
models.add(GenotypeLikelihoodsCalculationModel.Model.SNP); models.add(GenotypeLikelihoodsCalculationModel.Model.SNP);
else if ( UAC.GLmodel.name().toUpperCase().contains("SNP") ) else if ( UAC.GLmodel.name().toUpperCase().contains("SNP") )
models.add(UAC.GLmodel); models.add(UAC.GLmodel);
} }
else if ( vcInput.isIndel() || vcInput.isMixed() ) { else if ( vcInput.isIndel() || vcInput.isMixed() ) {
// ignore INDELs if the user chose SNP mode only // ignore INDELs if the user chose SNP mode only
if ( UAC.GLmodel == GenotypeLikelihoodsCalculationModel.Model.BOTH ) if ( UAC.GLmodel == GenotypeLikelihoodsCalculationModel.Model.BOTH )
models.add(GenotypeLikelihoodsCalculationModel.Model.INDEL); models.add(GenotypeLikelihoodsCalculationModel.Model.INDEL);
else if (UAC.GLmodel.name().toUpperCase().contains("INDEL")) else if (UAC.GLmodel.name().toUpperCase().contains("INDEL"))
models.add(UAC.GLmodel); models.add(UAC.GLmodel);
} }
// No support for other types yet // No support for other types yet
}
else {
if ( UAC.GLmodel == GenotypeLikelihoodsCalculationModel.Model.BOTH ) {
models.add(GenotypeLikelihoodsCalculationModel.Model.SNP);
models.add(GenotypeLikelihoodsCalculationModel.Model.INDEL);
} }
else { else {
if ( UAC.GLmodel == GenotypeLikelihoodsCalculationModel.Model.BOTH ) { models.add(UAC.GLmodel);
models.add(GenotypeLikelihoodsCalculationModel.Model.SNP);
models.add(GenotypeLikelihoodsCalculationModel.Model.INDEL);
}
else {
models.add(UAC.GLmodel);
}
} }
} }

View File

@ -117,7 +117,7 @@ public class PairHMMIndelErrorModel {
} }
static private void getContextHomopolymerLength(final byte[] refBytes, final int[] hrunArray) { static private void getContextHomopolymerLength(final byte[] refBytes, final int[] hrunArray) {
// compute forward hrun length, example: // compute forward hrun length, example:
// AGGTGACCCCCCTGAGAG // AGGTGACCCCCCTGAGAG
// 001000012345000000 // 001000012345000000
@ -164,10 +164,24 @@ public class PairHMMIndelErrorModel {
} }
} }
} }
public synchronized double[] computeReadHaplotypeLikelihoods(ReadBackedPileup pileup, LinkedHashMap<Allele,Haplotype> haplotypeMap, ReferenceContext ref, int eventLength, HashMap<PileupElement, LinkedHashMap<Allele,Double>> indelLikelihoodMap){
public synchronized double[] computeDiploidReadHaplotypeLikelihoods(ReadBackedPileup pileup, LinkedHashMap<Allele, Haplotype> haplotypeMap, ReferenceContext ref, int eventLength, HashMap<PileupElement, LinkedHashMap<Allele, Double>> indelLikelihoodMap){
final int numHaplotypes = haplotypeMap.size(); final int numHaplotypes = haplotypeMap.size();
final double readLikelihoods[][] = new double[pileup.getNumberOfElements()][numHaplotypes];
final int readCounts[] = new int[pileup.getNumberOfElements()]; final int readCounts[] = new int[pileup.getNumberOfElements()];
final double[][] readLikelihoods = computeGeneralReadHaplotypeLikelihoods(pileup, haplotypeMap, ref, eventLength, indelLikelihoodMap, readCounts);
return getDiploidHaplotypeLikelihoods(numHaplotypes, readCounts, readLikelihoods);
}
public synchronized double[][] computeGeneralReadHaplotypeLikelihoods(final ReadBackedPileup pileup,
final LinkedHashMap<Allele, Haplotype> haplotypeMap,
final ReferenceContext ref,
final int eventLength,
final HashMap<PileupElement, LinkedHashMap<Allele, Double>> indelLikelihoodMap,
final int[] readCounts) {
final double readLikelihoods[][] = new double[pileup.getNumberOfElements()][haplotypeMap.size()];
final PairHMM pairHMM = new PairHMM(bandedLikelihoods); final PairHMM pairHMM = new PairHMM(bandedLikelihoods);
int readIdx=0; int readIdx=0;
@ -367,7 +381,7 @@ public class PairHMMIndelErrorModel {
} }
return getHaplotypeLikelihoods(numHaplotypes, readCounts, readLikelihoods); return readLikelihoods;
} }
private boolean useSoftClippedBases(GATKSAMRecord read, long eventStartPos, int eventLength) { private boolean useSoftClippedBases(GATKSAMRecord read, long eventStartPos, int eventLength) {
@ -385,7 +399,7 @@ public class PairHMMIndelErrorModel {
return b1.length; return b1.length;
} }
private static double[] getHaplotypeLikelihoods(final int numHaplotypes, final int readCounts[], final double readLikelihoods[][]) { private static double[] getDiploidHaplotypeLikelihoods(final int numHaplotypes, final int readCounts[], final double readLikelihoods[][]) {
final double[][] haplotypeLikehoodMatrix = new double[numHaplotypes][numHaplotypes]; final double[][] haplotypeLikehoodMatrix = new double[numHaplotypes][numHaplotypes];
// todo: MAD 09/26/11 -- I'm almost certain this calculation can be simplified to just a single loop without the intermediate NxN matrix // todo: MAD 09/26/11 -- I'm almost certain this calculation can be simplified to just a single loop without the intermediate NxN matrix

View File

@ -185,38 +185,36 @@ public class RealignerTargetCreator extends RodWalker<RealignerTargetCreator.Eve
} }
// look at the normal context to get deletions and positions with high entropy // look at the normal context to get deletions and positions with high entropy
if ( context.hasBasePileup() ) { final ReadBackedPileup pileup = context.getBasePileup();
final ReadBackedPileup pileup = context.getBasePileup();
int mismatchQualities = 0, totalQualities = 0; int mismatchQualities = 0, totalQualities = 0;
final byte refBase = ref.getBase(); final byte refBase = ref.getBase();
for ( PileupElement p : pileup ) { for ( PileupElement p : pileup ) {
// check the ends of the reads to see how far they extend // check the ends of the reads to see how far they extend
furthestStopPos = Math.max(furthestStopPos, p.getRead().getAlignmentEnd()); furthestStopPos = Math.max(furthestStopPos, p.getRead().getAlignmentEnd());
// is it a deletion or insertion? // is it a deletion or insertion?
if ( p.isDeletion() || p.isBeforeInsertion() ) { if ( p.isDeletion() || p.isBeforeInsertion() ) {
hasIndel = true; hasIndel = true;
if ( p.isBeforeInsertion() ) if ( p.isBeforeInsertion() )
hasInsertion = true; hasInsertion = true;
}
// look for mismatches
else if ( lookForMismatchEntropy ) {
if ( p.getBase() != refBase )
mismatchQualities += p.getQual();
totalQualities += p.getQual();
}
} }
// make sure we're supposed to look for high entropy // look for mismatches
if ( lookForMismatchEntropy && else if ( lookForMismatchEntropy ) {
pileup.getNumberOfElements() >= minReadsAtLocus && if ( p.getBase() != refBase )
(double)mismatchQualities / (double)totalQualities >= mismatchThreshold ) mismatchQualities += p.getQual();
hasPointEvent = true; totalQualities += p.getQual();
}
} }
// make sure we're supposed to look for high entropy
if ( lookForMismatchEntropy &&
pileup.getNumberOfElements() >= minReadsAtLocus &&
(double)mismatchQualities / (double)totalQualities >= mismatchThreshold )
hasPointEvent = true;
// return null if no event occurred // return null if no event occurred
if ( !hasIndel && !hasPointEvent ) if ( !hasIndel && !hasPointEvent )
return null; return null;

View File

@ -316,6 +316,7 @@ public class SomaticIndelDetectorWalker extends ReadWalker<Integer,Integer> {
// first, the basic info // first, the basic info
headerInfo.add(new VCFHeaderLine("source", "SomaticIndelDetector")); headerInfo.add(new VCFHeaderLine("source", "SomaticIndelDetector"));
headerInfo.add(new VCFHeaderLine("reference", getToolkit().getArguments().referenceFile.getName())); headerInfo.add(new VCFHeaderLine("reference", getToolkit().getArguments().referenceFile.getName()));
headerInfo.add(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_KEY, 1, VCFHeaderLineType.String, "Genotype"));
// FORMAT and INFO fields // FORMAT and INFO fields
// headerInfo.addAll(VCFUtils.getSupportedHeaderStrings()); // headerInfo.addAll(VCFUtils.getSupportedHeaderStrings());
@ -616,7 +617,7 @@ public class SomaticIndelDetectorWalker extends ReadWalker<Integer,Integer> {
throw new StingException("Read "+read.getReadName()+" from "+getToolkit().getSourceFileForReaderID(getToolkit().getReaderIDForRead(read))+ throw new StingException("Read "+read.getReadName()+" from "+getToolkit().getSourceFileForReaderID(getToolkit().getReaderIDForRead(read))+
"has no Normal/Tumor tag associated with it"); "has no Normal/Tumor tag associated with it");
// String rg = (String)read.getAttribute("RG"); // String rg = (String)read.getExtendedAttribute("RG");
// if ( rg == null ) // if ( rg == null )
// throw new UserException.MalformedBam(read, "Read "+read.getReadName()+" has no read group in merged stream. RG is required for somatic calls."); // throw new UserException.MalformedBam(read, "Read "+read.getReadName()+" has no read group in merged stream. RG is required for somatic calls.");
@ -1147,13 +1148,12 @@ public class SomaticIndelDetectorWalker extends ReadWalker<Integer,Integer> {
GenotypesContext genotypes = GenotypesContext.create(); GenotypesContext genotypes = GenotypesContext.create();
for ( String sample : normalSamples ) { for ( String sample : normalSamples ) {
final GenotypeBuilder gb = new GenotypeBuilder(sample);
Map<String,Object> attrs = call.makeStatsAttributes(null); gb.attributes(call.makeStatsAttributes(null));
gb.alleles(! discard_event
if ( ! discard_event ) // we made a call - put actual het genotype here: ? alleles // we made a call - put actual het genotype here:
genotypes.add(new Genotype(sample,alleles,Genotype.NO_LOG10_PERROR,null,attrs,false)); : homref_alleles); // no call: genotype is ref/ref (but alleles still contain the alt if we observed anything at all)
else // no call: genotype is ref/ref (but alleles still contain the alt if we observed anything at all) genotypes.add(gb.make());
genotypes.add(new Genotype(sample, homref_alleles,Genotype.NO_LOG10_PERROR,null,attrs,false));
} }
Set<String> filters = null; Set<String> filters = null;
@ -1237,11 +1237,11 @@ public class SomaticIndelDetectorWalker extends ReadWalker<Integer,Integer> {
GenotypesContext genotypes = GenotypesContext.create(); GenotypesContext genotypes = GenotypesContext.create();
for ( String sample : normalSamples ) { for ( String sample : normalSamples ) {
genotypes.add(new Genotype(sample, homRefN ? homRefAlleles : alleles,Genotype.NO_LOG10_PERROR,null,attrsNormal,false)); genotypes.add(GenotypeBuilder.create(sample, homRefN ? homRefAlleles : alleles, attrsNormal));
} }
for ( String sample : tumorSamples ) { for ( String sample : tumorSamples ) {
genotypes.add(new Genotype(sample, homRefT ? homRefAlleles : alleles,Genotype.NO_LOG10_PERROR,null,attrsTumor,false) ); genotypes.add(GenotypeBuilder.create(sample, homRefT ? homRefAlleles : alleles, attrsTumor));
} }
Set<String> filters = null; Set<String> filters = null;
@ -2143,7 +2143,7 @@ public class SomaticIndelDetectorWalker extends ReadWalker<Integer,Integer> {
class VCFIndelAttributes { class VCFIndelAttributes {
public static String ALLELIC_DEPTH_KEY = "AD"; public static String ALLELIC_DEPTH_KEY = VCFConstants.GENOTYPE_ALLELE_DEPTHS;
public static String DEPTH_TOTAL_KEY = VCFConstants.DEPTH_KEY; public static String DEPTH_TOTAL_KEY = VCFConstants.DEPTH_KEY;
public static String MAPQ_KEY = "MQS"; public static String MAPQ_KEY = "MQS";

View File

@ -97,10 +97,10 @@ public class PhaseByTransmission extends RodWalker<HashMap<Byte,Integer>, HashMa
private ArrayList<Sample> trios = new ArrayList<Sample>(); private ArrayList<Sample> trios = new ArrayList<Sample>();
//Matrix of priors for all genotype combinations //Matrix of priors for all genotype combinations
private EnumMap<Genotype.Type,EnumMap<Genotype.Type,EnumMap<Genotype.Type,Integer>>> mvCountMatrix; private EnumMap<GenotypeType,EnumMap<GenotypeType,EnumMap<GenotypeType,Integer>>> mvCountMatrix;
//Matrix of allele transmission //Matrix of allele transmission
private EnumMap<Genotype.Type,EnumMap<Genotype.Type,EnumMap<Genotype.Type,TrioPhase>>> transmissionMatrix; private EnumMap<GenotypeType,EnumMap<GenotypeType,EnumMap<GenotypeType,TrioPhase>>> transmissionMatrix;
//Metrics counters hash keys //Metrics counters hash keys
private final Byte NUM_TRIO_GENOTYPES_CALLED = 0; private final Byte NUM_TRIO_GENOTYPES_CALLED = 0;
@ -138,17 +138,17 @@ public class PhaseByTransmission extends RodWalker<HashMap<Byte,Integer>, HashMa
private EnumMap<FamilyMember,Genotype> trioPhasedGenotypes = new EnumMap<FamilyMember, Genotype>(FamilyMember.class); private EnumMap<FamilyMember,Genotype> trioPhasedGenotypes = new EnumMap<FamilyMember, Genotype>(FamilyMember.class);
private ArrayList<Allele> getAlleles(Genotype.Type genotype){ private ArrayList<Allele> getAlleles(GenotypeType genotype){
ArrayList<Allele> alleles = new ArrayList<Allele>(2); ArrayList<Allele> alleles = new ArrayList<Allele>(2);
if(genotype == Genotype.Type.HOM_REF){ if(genotype == GenotypeType.HOM_REF){
alleles.add(REF); alleles.add(REF);
alleles.add(REF); alleles.add(REF);
} }
else if(genotype == Genotype.Type.HET){ else if(genotype == GenotypeType.HET){
alleles.add(REF); alleles.add(REF);
alleles.add(VAR); alleles.add(VAR);
} }
else if(genotype == Genotype.Type.HOM_VAR){ else if(genotype == GenotypeType.HOM_VAR){
alleles.add(VAR); alleles.add(VAR);
alleles.add(VAR); alleles.add(VAR);
} }
@ -158,27 +158,34 @@ public class PhaseByTransmission extends RodWalker<HashMap<Byte,Integer>, HashMa
return alleles; return alleles;
} }
private boolean isPhasable(Genotype.Type genotype){ private boolean isPhasable(GenotypeType genotype){
return genotype == Genotype.Type.HOM_REF || genotype == Genotype.Type.HET || genotype == Genotype.Type.HOM_VAR; return genotype == GenotypeType.HOM_REF || genotype == GenotypeType.HET || genotype == GenotypeType.HOM_VAR;
} }
//Create a new Genotype based on information from a single individual //Create a new Genotype based on information from a single individual
//Homozygous genotypes will be set as phased, heterozygous won't be //Homozygous genotypes will be set as phased, heterozygous won't be
private void phaseSingleIndividualAlleles(Genotype.Type genotype, FamilyMember familyMember){ private void phaseSingleIndividualAlleles(GenotypeType genotype, FamilyMember familyMember){
if(genotype == Genotype.Type.HOM_REF || genotype == Genotype.Type.HOM_VAR){ boolean phase = genotype == GenotypeType.HOM_REF || genotype == GenotypeType.HOM_VAR;
trioPhasedGenotypes.put(familyMember, new Genotype(DUMMY_NAME, getAlleles(genotype), Genotype.NO_LOG10_PERROR, null, null, true)); trioPhasedGenotypes.put(familyMember, makeGenotype(genotype, phase));
} }
else
trioPhasedGenotypes.put(familyMember, new Genotype(DUMMY_NAME,getAlleles(genotype),Genotype.NO_LOG10_PERROR,null,null,false)); private Genotype makeGenotype(final GenotypeType type, boolean phase) {
return makeGenotype(getAlleles(type), phase);
}
private Genotype makeGenotype(final List<Allele> alleles, boolean phase) {
final GenotypeBuilder gb = new GenotypeBuilder(DUMMY_NAME, alleles);
gb.phased(phase);
return gb.make();
} }
//Find the phase for a parent/child pair //Find the phase for a parent/child pair
private void phasePairAlleles(Genotype.Type parentGenotype, Genotype.Type childGenotype, FamilyMember parent){ private void phasePairAlleles(GenotypeType parentGenotype, GenotypeType childGenotype, FamilyMember parent){
//Special case for Het/Het as it is ambiguous //Special case for Het/Het as it is ambiguous
if(parentGenotype == Genotype.Type.HET && childGenotype == Genotype.Type.HET){ if(parentGenotype == GenotypeType.HET && childGenotype == GenotypeType.HET){
trioPhasedGenotypes.put(parent, new Genotype(DUMMY_NAME, getAlleles(parentGenotype), Genotype.NO_LOG10_PERROR, null, null, false)); trioPhasedGenotypes.put(parent, makeGenotype(parentGenotype, false));
trioPhasedGenotypes.put(FamilyMember.CHILD, new Genotype(DUMMY_NAME,getAlleles(childGenotype),Genotype.NO_LOG10_PERROR,null,null,false)); trioPhasedGenotypes.put(FamilyMember.CHILD, makeGenotype(childGenotype, false));
return; return;
} }
@ -190,34 +197,34 @@ public class PhaseByTransmission extends RodWalker<HashMap<Byte,Integer>, HashMa
//If there is a possible phasing between the parent and child => phase //If there is a possible phasing between the parent and child => phase
int childTransmittedAlleleIndex = childAlleles.indexOf(parentAlleles.get(0)); int childTransmittedAlleleIndex = childAlleles.indexOf(parentAlleles.get(0));
if(childTransmittedAlleleIndex > -1){ if(childTransmittedAlleleIndex > -1){
trioPhasedGenotypes.put(parent, new Genotype(DUMMY_NAME, parentAlleles, Genotype.NO_LOG10_PERROR, null, null, true)); trioPhasedGenotypes.put(parent, makeGenotype(parentAlleles, true));
childPhasedAlleles.add(childAlleles.remove(childTransmittedAlleleIndex)); childPhasedAlleles.add(childAlleles.remove(childTransmittedAlleleIndex));
if(parent.equals(FamilyMember.MOTHER)) if(parent.equals(FamilyMember.MOTHER))
childPhasedAlleles.add(childAlleles.get(0)); childPhasedAlleles.add(childAlleles.get(0));
else else
childPhasedAlleles.add(0,childAlleles.get(0)); childPhasedAlleles.add(0,childAlleles.get(0));
trioPhasedGenotypes.put(FamilyMember.CHILD, new Genotype(DUMMY_NAME, childPhasedAlleles, Genotype.NO_LOG10_PERROR, null, null, true)); trioPhasedGenotypes.put(FamilyMember.CHILD, makeGenotype(childPhasedAlleles, true));
} }
else if((childTransmittedAlleleIndex = childAlleles.indexOf(parentAlleles.get(1))) > -1){ else if((childTransmittedAlleleIndex = childAlleles.indexOf(parentAlleles.get(1))) > -1){
parentPhasedAlleles.add(parentAlleles.get(1)); parentPhasedAlleles.add(parentAlleles.get(1));
parentPhasedAlleles.add(parentAlleles.get(0)); parentPhasedAlleles.add(parentAlleles.get(0));
trioPhasedGenotypes.put(parent, new Genotype(DUMMY_NAME, parentPhasedAlleles, Genotype.NO_LOG10_PERROR, null, null, true)); trioPhasedGenotypes.put(parent, makeGenotype(parentPhasedAlleles, true));
childPhasedAlleles.add(childAlleles.remove(childTransmittedAlleleIndex)); childPhasedAlleles.add(childAlleles.remove(childTransmittedAlleleIndex));
if(parent.equals(FamilyMember.MOTHER)) if(parent.equals(FamilyMember.MOTHER))
childPhasedAlleles.add(childAlleles.get(0)); childPhasedAlleles.add(childAlleles.get(0));
else else
childPhasedAlleles.add(0,childAlleles.get(0)); childPhasedAlleles.add(0,childAlleles.get(0));
trioPhasedGenotypes.put(FamilyMember.CHILD, new Genotype(DUMMY_NAME, childPhasedAlleles, Genotype.NO_LOG10_PERROR, null, null, true)); trioPhasedGenotypes.put(FamilyMember.CHILD, makeGenotype(childPhasedAlleles, true));
} }
//This is a Mendelian Violation => Do not phase //This is a Mendelian Violation => Do not phase
else{ else{
trioPhasedGenotypes.put(parent, new Genotype(DUMMY_NAME,getAlleles(parentGenotype),Genotype.NO_LOG10_PERROR,null,null,false)); trioPhasedGenotypes.put(parent, makeGenotype(parentGenotype, false));
trioPhasedGenotypes.put(FamilyMember.CHILD, new Genotype(DUMMY_NAME,getAlleles(childGenotype),Genotype.NO_LOG10_PERROR,null,null,false)); trioPhasedGenotypes.put(FamilyMember.CHILD, makeGenotype(childGenotype, false));
} }
} }
//Phases a family by transmission //Phases a family by transmission
private void phaseFamilyAlleles(Genotype.Type mother, Genotype.Type father, Genotype.Type child){ private void phaseFamilyAlleles(GenotypeType mother, GenotypeType father, GenotypeType child){
Set<ArrayList<Allele>> possiblePhasedChildGenotypes = new HashSet<ArrayList<Allele>>(); Set<ArrayList<Allele>> possiblePhasedChildGenotypes = new HashSet<ArrayList<Allele>>();
ArrayList<Allele> motherAlleles = getAlleles(mother); ArrayList<Allele> motherAlleles = getAlleles(mother);
@ -246,7 +253,7 @@ public class PhaseByTransmission extends RodWalker<HashMap<Byte,Integer>, HashMa
motherPhasedAlleles.add(motherAlleles.get(0)); motherPhasedAlleles.add(motherAlleles.get(0));
else else
motherPhasedAlleles.add(motherAlleles.get(1)); motherPhasedAlleles.add(motherAlleles.get(1));
trioPhasedGenotypes.put(FamilyMember.MOTHER, new Genotype(DUMMY_NAME,motherPhasedAlleles,Genotype.NO_LOG10_PERROR,null,null,true)); trioPhasedGenotypes.put(FamilyMember.MOTHER, makeGenotype(motherPhasedAlleles, true));
//Create father's genotype //Create father's genotype
ArrayList<Allele> fatherPhasedAlleles = new ArrayList<Allele>(2); ArrayList<Allele> fatherPhasedAlleles = new ArrayList<Allele>(2);
@ -255,10 +262,10 @@ public class PhaseByTransmission extends RodWalker<HashMap<Byte,Integer>, HashMa
fatherPhasedAlleles.add(fatherAlleles.get(0)); fatherPhasedAlleles.add(fatherAlleles.get(0));
else else
fatherPhasedAlleles.add(fatherAlleles.get(1)); fatherPhasedAlleles.add(fatherAlleles.get(1));
trioPhasedGenotypes.put(FamilyMember.FATHER, new Genotype(DUMMY_NAME,fatherPhasedAlleles,Genotype.NO_LOG10_PERROR,null,null,true)); trioPhasedGenotypes.put(FamilyMember.FATHER, makeGenotype(fatherPhasedAlleles,true));
//Create child's genotype //Create child's genotype
trioPhasedGenotypes.put(FamilyMember.CHILD, new Genotype(DUMMY_NAME,childPhasedAllelesAlleles,Genotype.NO_LOG10_PERROR,null,null,true)); trioPhasedGenotypes.put(FamilyMember.CHILD, makeGenotype(childPhasedAllelesAlleles,true));
//Once a phased combination is found; exit //Once a phased combination is found; exit
return; return;
@ -266,16 +273,16 @@ public class PhaseByTransmission extends RodWalker<HashMap<Byte,Integer>, HashMa
} }
//If this is reached then no phasing could be found //If this is reached then no phasing could be found
trioPhasedGenotypes.put(FamilyMember.MOTHER, new Genotype(DUMMY_NAME,getAlleles(mother),Genotype.NO_LOG10_PERROR,null,null,false)); trioPhasedGenotypes.put(FamilyMember.MOTHER, makeGenotype(mother,false));
trioPhasedGenotypes.put(FamilyMember.FATHER, new Genotype(DUMMY_NAME,getAlleles(father),Genotype.NO_LOG10_PERROR,null,null,false)); trioPhasedGenotypes.put(FamilyMember.FATHER, makeGenotype(father,false));
trioPhasedGenotypes.put(FamilyMember.CHILD, new Genotype(DUMMY_NAME,getAlleles(child),Genotype.NO_LOG10_PERROR,null,null,false)); trioPhasedGenotypes.put(FamilyMember.CHILD, makeGenotype(child,false));
} }
/* Constructor: Creates a conceptual trio genotype combination from the given genotypes. /* Constructor: Creates a conceptual trio genotype combination from the given genotypes.
If one or more genotypes are set as NO_CALL or UNAVAILABLE, it will phase them like a pair If one or more genotypes are set as NO_CALL or UNAVAILABLE, it will phase them like a pair
or single individual. or single individual.
*/ */
public TrioPhase(Genotype.Type mother, Genotype.Type father, Genotype.Type child){ public TrioPhase(GenotypeType mother, GenotypeType father, GenotypeType child){
//Take care of cases where one or more family members are no call //Take care of cases where one or more family members are no call
if(!isPhasable(child)){ if(!isPhasable(child)){
@ -297,7 +304,7 @@ public class PhaseByTransmission extends RodWalker<HashMap<Byte,Integer>, HashMa
phaseSingleIndividualAlleles(father, FamilyMember.FATHER); phaseSingleIndividualAlleles(father, FamilyMember.FATHER);
} }
//Special case for Het/Het/Het as it is ambiguous //Special case for Het/Het/Het as it is ambiguous
else if(mother == Genotype.Type.HET && father == Genotype.Type.HET && child == Genotype.Type.HET){ else if(mother == GenotypeType.HET && father == GenotypeType.HET && child == GenotypeType.HET){
phaseSingleIndividualAlleles(mother, FamilyMember.MOTHER); phaseSingleIndividualAlleles(mother, FamilyMember.MOTHER);
phaseSingleIndividualAlleles(father, FamilyMember.FATHER); phaseSingleIndividualAlleles(father, FamilyMember.FATHER);
phaseSingleIndividualAlleles(child, FamilyMember.CHILD); phaseSingleIndividualAlleles(child, FamilyMember.CHILD);
@ -311,7 +318,7 @@ public class PhaseByTransmission extends RodWalker<HashMap<Byte,Integer>, HashMa
if(fatherFAlleleFirst && trioPhasedGenotypes.get(FamilyMember.CHILD).isPhased()){ if(fatherFAlleleFirst && trioPhasedGenotypes.get(FamilyMember.CHILD).isPhased()){
ArrayList<Allele> childAlleles = new ArrayList<Allele>(trioPhasedGenotypes.get(FamilyMember.CHILD).getAlleles()); ArrayList<Allele> childAlleles = new ArrayList<Allele>(trioPhasedGenotypes.get(FamilyMember.CHILD).getAlleles());
childAlleles.add(childAlleles.remove(0)); childAlleles.add(childAlleles.remove(0));
trioPhasedGenotypes.put(FamilyMember.CHILD,new Genotype(DUMMY_NAME,childAlleles,Genotype.NO_LOG10_PERROR,null,null,true)); trioPhasedGenotypes.put(FamilyMember.CHILD,makeGenotype(childAlleles,true));
} }
} }
@ -347,7 +354,7 @@ public class PhaseByTransmission extends RodWalker<HashMap<Byte,Integer>, HashMa
//Add the transmission probability //Add the transmission probability
Map<String, Object> genotypeAttributes = new HashMap<String, Object>(); Map<String, Object> genotypeAttributes = new HashMap<String, Object>();
genotypeAttributes.putAll(genotype.getAttributes()); genotypeAttributes.putAll(genotype.getExtendedAttributes());
if(transmissionProb>NO_TRANSMISSION_PROB) if(transmissionProb>NO_TRANSMISSION_PROB)
genotypeAttributes.put(TRANSMISSION_PROBABILITY_TAG_NAME, phredScoreTransmission); genotypeAttributes.put(TRANSMISSION_PROBABILITY_TAG_NAME, phredScoreTransmission);
@ -370,7 +377,10 @@ public class PhaseByTransmission extends RodWalker<HashMap<Byte,Integer>, HashMa
else else
log10Error = genotype.getLikelihoods().getLog10GQ(phasedGenotype.getType()); log10Error = genotype.getLikelihoods().getLog10GQ(phasedGenotype.getType());
return new Genotype(genotype.getSampleName(), phasedAlleles, log10Error, null, genotypeAttributes, phasedGenotype.isPhased()); return new GenotypeBuilder(genotype).alleles(phasedAlleles)
.log10PError(log10Error)
.attributes(genotypeAttributes)
.phased(phasedGenotype.isPhased()).make();
} }
@ -438,15 +448,15 @@ public class PhaseByTransmission extends RodWalker<HashMap<Byte,Integer>, HashMa
//Create the transmission matrices //Create the transmission matrices
private void buildMatrices(){ private void buildMatrices(){
mvCountMatrix = new EnumMap<Genotype.Type,EnumMap<Genotype.Type,EnumMap<Genotype.Type,Integer>>>(Genotype.Type.class); mvCountMatrix = new EnumMap<GenotypeType,EnumMap<GenotypeType,EnumMap<GenotypeType,Integer>>>(GenotypeType.class);
transmissionMatrix = new EnumMap<Genotype.Type,EnumMap<Genotype.Type,EnumMap<Genotype.Type,TrioPhase>>>(Genotype.Type.class); transmissionMatrix = new EnumMap<GenotypeType,EnumMap<GenotypeType,EnumMap<GenotypeType,TrioPhase>>>(GenotypeType.class);
for(Genotype.Type mother : Genotype.Type.values()){ for(GenotypeType mother : GenotypeType.values()){
mvCountMatrix.put(mother,new EnumMap<Genotype.Type,EnumMap<Genotype.Type,Integer>>(Genotype.Type.class)); mvCountMatrix.put(mother,new EnumMap<GenotypeType,EnumMap<GenotypeType,Integer>>(GenotypeType.class));
transmissionMatrix.put(mother,new EnumMap<Genotype.Type,EnumMap<Genotype.Type,TrioPhase>>(Genotype.Type.class)); transmissionMatrix.put(mother,new EnumMap<GenotypeType,EnumMap<GenotypeType,TrioPhase>>(GenotypeType.class));
for(Genotype.Type father : Genotype.Type.values()){ for(GenotypeType father : GenotypeType.values()){
mvCountMatrix.get(mother).put(father,new EnumMap<Genotype.Type, Integer>(Genotype.Type.class)); mvCountMatrix.get(mother).put(father,new EnumMap<GenotypeType, Integer>(GenotypeType.class));
transmissionMatrix.get(mother).put(father,new EnumMap<Genotype.Type,TrioPhase>(Genotype.Type.class)); transmissionMatrix.get(mother).put(father,new EnumMap<GenotypeType,TrioPhase>(GenotypeType.class));
for(Genotype.Type child : Genotype.Type.values()){ for(GenotypeType child : GenotypeType.values()){
mvCountMatrix.get(mother).get(father).put(child, getCombinationMVCount(mother, father, child)); mvCountMatrix.get(mother).get(father).put(child, getCombinationMVCount(mother, father, child));
transmissionMatrix.get(mother).get(father).put(child,new TrioPhase(mother,father,child)); transmissionMatrix.get(mother).get(father).put(child,new TrioPhase(mother,father,child));
} }
@ -457,16 +467,16 @@ public class PhaseByTransmission extends RodWalker<HashMap<Byte,Integer>, HashMa
//Returns the number of Mendelian Violations for a given genotype combination. //Returns the number of Mendelian Violations for a given genotype combination.
//If one of the parents genotype is missing, it will consider it as a parent/child pair //If one of the parents genotype is missing, it will consider it as a parent/child pair
//If the child genotype or both parents genotypes are missing, 0 is returned. //If the child genotype or both parents genotypes are missing, 0 is returned.
private int getCombinationMVCount(Genotype.Type mother, Genotype.Type father, Genotype.Type child){ private int getCombinationMVCount(GenotypeType mother, GenotypeType father, GenotypeType child){
//Child is no call => No MV //Child is no call => No MV
if(child == Genotype.Type.NO_CALL || child == Genotype.Type.UNAVAILABLE) if(child == GenotypeType.NO_CALL || child == GenotypeType.UNAVAILABLE)
return 0; return 0;
//Add parents with genotypes for the evaluation //Add parents with genotypes for the evaluation
ArrayList<Genotype.Type> parents = new ArrayList<Genotype.Type>(); ArrayList<GenotypeType> parents = new ArrayList<GenotypeType>();
if (!(mother == Genotype.Type.NO_CALL || mother == Genotype.Type.UNAVAILABLE)) if (!(mother == GenotypeType.NO_CALL || mother == GenotypeType.UNAVAILABLE))
parents.add(mother); parents.add(mother);
if (!(father == Genotype.Type.NO_CALL || father == Genotype.Type.UNAVAILABLE)) if (!(father == GenotypeType.NO_CALL || father == GenotypeType.UNAVAILABLE))
parents.add(father); parents.add(father);
//Both parents no calls => No MV //Both parents no calls => No MV
@ -477,35 +487,35 @@ public class PhaseByTransmission extends RodWalker<HashMap<Byte,Integer>, HashMa
int parentsNumRefAlleles = 0; int parentsNumRefAlleles = 0;
int parentsNumAltAlleles = 0; int parentsNumAltAlleles = 0;
for(Genotype.Type parent : parents){ for(GenotypeType parent : parents){
if(parent == Genotype.Type.HOM_REF){ if(parent == GenotypeType.HOM_REF){
parentsNumRefAlleles++; parentsNumRefAlleles++;
} }
else if(parent == Genotype.Type.HET){ else if(parent == GenotypeType.HET){
parentsNumRefAlleles++; parentsNumRefAlleles++;
parentsNumAltAlleles++; parentsNumAltAlleles++;
} }
else if(parent == Genotype.Type.HOM_VAR){ else if(parent == GenotypeType.HOM_VAR){
parentsNumAltAlleles++; parentsNumAltAlleles++;
} }
} }
//Case Child is HomRef //Case Child is HomRef
if(child == Genotype.Type.HOM_REF){ if(child == GenotypeType.HOM_REF){
if(parentsNumRefAlleles == parents.size()) if(parentsNumRefAlleles == parents.size())
return 0; return 0;
else return (parents.size()-parentsNumRefAlleles); else return (parents.size()-parentsNumRefAlleles);
} }
//Case child is HomVar //Case child is HomVar
if(child == Genotype.Type.HOM_VAR){ if(child == GenotypeType.HOM_VAR){
if(parentsNumAltAlleles == parents.size()) if(parentsNumAltAlleles == parents.size())
return 0; return 0;
else return parents.size()-parentsNumAltAlleles; else return parents.size()-parentsNumAltAlleles;
} }
//Case child is Het //Case child is Het
if(child == Genotype.Type.HET && ((parentsNumRefAlleles > 0 && parentsNumAltAlleles > 0) || parents.size()<2)) if(child == GenotypeType.HET && ((parentsNumRefAlleles > 0 && parentsNumAltAlleles > 0) || parents.size()<2))
return 0; return 0;
//MV //MV
@ -513,7 +523,7 @@ public class PhaseByTransmission extends RodWalker<HashMap<Byte,Integer>, HashMa
} }
//Given two trio genotypes combinations, returns the number of different genotypes between the two combinations. //Given two trio genotypes combinations, returns the number of different genotypes between the two combinations.
private int countFamilyGenotypeDiff(Genotype.Type motherOriginal,Genotype.Type fatherOriginal,Genotype.Type childOriginal,Genotype.Type motherNew,Genotype.Type fatherNew,Genotype.Type childNew){ private int countFamilyGenotypeDiff(GenotypeType motherOriginal,GenotypeType fatherOriginal,GenotypeType childOriginal,GenotypeType motherNew,GenotypeType fatherNew,GenotypeType childNew){
int count = 0; int count = 0;
if(motherOriginal!=motherNew) if(motherOriginal!=motherNew)
count++; count++;
@ -526,21 +536,21 @@ public class PhaseByTransmission extends RodWalker<HashMap<Byte,Integer>, HashMa
//Get a Map of genotype likelihoods. //Get a Map of genotype likelihoods.
//In case of null, unavailable or no call, all likelihoods are 1/3. //In case of null, unavailable or no call, all likelihoods are 1/3.
private EnumMap<Genotype.Type,Double> getLikelihoodsAsMapSafeNull(Genotype genotype){ private EnumMap<GenotypeType,Double> getLikelihoodsAsMapSafeNull(Genotype genotype){
if(genotype == null || !genotype.isCalled()){ if(genotype == null || !genotype.isCalled()){
EnumMap<Genotype.Type,Double> likelihoods = new EnumMap<Genotype.Type, Double>(Genotype.Type.class); EnumMap<GenotypeType,Double> likelihoods = new EnumMap<GenotypeType, Double>(GenotypeType.class);
likelihoods.put(Genotype.Type.HOM_REF,1.0/3.0); likelihoods.put(GenotypeType.HOM_REF,1.0/3.0);
likelihoods.put(Genotype.Type.HET,1.0/3.0); likelihoods.put(GenotypeType.HET,1.0/3.0);
likelihoods.put(Genotype.Type.HOM_VAR,1.0/3.0); likelihoods.put(GenotypeType.HOM_VAR,1.0/3.0);
return likelihoods; return likelihoods;
} }
return genotype.getLikelihoods().getAsMap(true); return genotype.getLikelihoods().getAsMap(true);
} }
//Returns the Genotype.Type; returns UNVAILABLE if given null //Returns the GenotypeType; returns UNVAILABLE if given null
private Genotype.Type getTypeSafeNull(Genotype genotype){ private GenotypeType getTypeSafeNull(Genotype genotype){
if(genotype == null) if(genotype == null)
return Genotype.Type.UNAVAILABLE; return GenotypeType.UNAVAILABLE;
return genotype.getType(); return genotype.getType();
} }
@ -561,18 +571,18 @@ public class PhaseByTransmission extends RodWalker<HashMap<Byte,Integer>, HashMa
//Always assign the first parent as the parent having genotype information in pairs //Always assign the first parent as the parent having genotype information in pairs
//Always assign the mother as the first parent in trios //Always assign the mother as the first parent in trios
int parentsCalled = 0; int parentsCalled = 0;
Map<Genotype.Type,Double> firstParentLikelihoods; Map<GenotypeType,Double> firstParentLikelihoods;
Map<Genotype.Type,Double> secondParentLikelihoods; Map<GenotypeType,Double> secondParentLikelihoods;
ArrayList<Genotype.Type> bestFirstParentGenotype = new ArrayList<Genotype.Type>(); ArrayList<GenotypeType> bestFirstParentGenotype = new ArrayList<GenotypeType>();
ArrayList<Genotype.Type> bestSecondParentGenotype = new ArrayList<Genotype.Type>(); ArrayList<GenotypeType> bestSecondParentGenotype = new ArrayList<GenotypeType>();
ArrayList<Genotype.Type> bestChildGenotype = new ArrayList<Genotype.Type>(); ArrayList<GenotypeType> bestChildGenotype = new ArrayList<GenotypeType>();
Genotype.Type pairSecondParentGenotype = null; GenotypeType pairSecondParentGenotype = null;
if(mother == null || !mother.isCalled()){ if(mother == null || !mother.isCalled()){
firstParentLikelihoods = getLikelihoodsAsMapSafeNull(father); firstParentLikelihoods = getLikelihoodsAsMapSafeNull(father);
secondParentLikelihoods = getLikelihoodsAsMapSafeNull(mother); secondParentLikelihoods = getLikelihoodsAsMapSafeNull(mother);
bestFirstParentGenotype.add(getTypeSafeNull(father)); bestFirstParentGenotype.add(getTypeSafeNull(father));
bestSecondParentGenotype.add(getTypeSafeNull(mother)); bestSecondParentGenotype.add(getTypeSafeNull(mother));
pairSecondParentGenotype = mother == null ? Genotype.Type.UNAVAILABLE : mother.getType(); pairSecondParentGenotype = mother == null ? GenotypeType.UNAVAILABLE : mother.getType();
if(father != null && father.isCalled()) if(father != null && father.isCalled())
parentsCalled = 1; parentsCalled = 1;
} }
@ -583,12 +593,12 @@ public class PhaseByTransmission extends RodWalker<HashMap<Byte,Integer>, HashMa
bestSecondParentGenotype.add(getTypeSafeNull(father)); bestSecondParentGenotype.add(getTypeSafeNull(father));
if(father == null || !father.isCalled()){ if(father == null || !father.isCalled()){
parentsCalled = 1; parentsCalled = 1;
pairSecondParentGenotype = father == null ? Genotype.Type.UNAVAILABLE : father.getType(); pairSecondParentGenotype = father == null ? GenotypeType.UNAVAILABLE : father.getType();
}else{ }else{
parentsCalled = 2; parentsCalled = 2;
} }
} }
Map<Genotype.Type,Double> childLikelihoods = getLikelihoodsAsMapSafeNull(child); Map<GenotypeType,Double> childLikelihoods = getLikelihoodsAsMapSafeNull(child);
bestChildGenotype.add(getTypeSafeNull(child)); bestChildGenotype.add(getTypeSafeNull(child));
//Prior vars //Prior vars
@ -604,9 +614,9 @@ public class PhaseByTransmission extends RodWalker<HashMap<Byte,Integer>, HashMa
int mvCount; int mvCount;
int cumulativeMVCount = 0; int cumulativeMVCount = 0;
double configurationLikelihood = 0; double configurationLikelihood = 0;
for(Map.Entry<Genotype.Type,Double> childGenotype : childLikelihoods.entrySet()){ for(Map.Entry<GenotypeType,Double> childGenotype : childLikelihoods.entrySet()){
for(Map.Entry<Genotype.Type,Double> firstParentGenotype : firstParentLikelihoods.entrySet()){ for(Map.Entry<GenotypeType,Double> firstParentGenotype : firstParentLikelihoods.entrySet()){
for(Map.Entry<Genotype.Type,Double> secondParentGenotype : secondParentLikelihoods.entrySet()){ for(Map.Entry<GenotypeType,Double> secondParentGenotype : secondParentLikelihoods.entrySet()){
mvCount = mvCountMatrix.get(firstParentGenotype.getKey()).get(secondParentGenotype.getKey()).get(childGenotype.getKey()); mvCount = mvCountMatrix.get(firstParentGenotype.getKey()).get(secondParentGenotype.getKey()).get(childGenotype.getKey());
//For parent/child pairs, sum over the possible genotype configurations of the missing parent //For parent/child pairs, sum over the possible genotype configurations of the missing parent
if(parentsCalled<2){ if(parentsCalled<2){
@ -797,9 +807,9 @@ public class PhaseByTransmission extends RodWalker<HashMap<Byte,Integer>, HashMa
updateTrioMetricsCounters(phasedMother,phasedFather,phasedChild,mvCount,metricsCounters); updateTrioMetricsCounters(phasedMother,phasedFather,phasedChild,mvCount,metricsCounters);
mvfLine = String.format("%s\t%d\t%s\t%s\t%s\t%s\t%s:%s:%s:%s\t%s:%s:%s:%s\t%s:%s:%s:%s", mvfLine = String.format("%s\t%d\t%s\t%s\t%s\t%s\t%s:%s:%s:%s\t%s:%s:%s:%s\t%s:%s:%s:%s",
vc.getChr(),vc.getStart(),vc.getFilters(),vc.getAttribute(VCFConstants.ALLELE_COUNT_KEY),sample.toString(), vc.getChr(),vc.getStart(),vc.getFilters(),vc.getAttribute(VCFConstants.ALLELE_COUNT_KEY),sample.toString(),
phasedMother.getAttribute(TRANSMISSION_PROBABILITY_TAG_NAME),phasedMother.getGenotypeString(),phasedMother.getAttribute(VCFConstants.DEPTH_KEY),phasedMother.getAttribute("AD"),phasedMother.getLikelihoodsString(), phasedMother.getExtendedAttribute(TRANSMISSION_PROBABILITY_TAG_NAME),phasedMother.getGenotypeString(),phasedMother.getDP(),Arrays.asList(phasedMother.getAD()),
phasedFather.getGenotypeString(),phasedFather.getAttribute(VCFConstants.DEPTH_KEY),phasedFather.getAttribute("AD"),phasedFather.getLikelihoodsString(), phasedMother.getLikelihoodsString(), phasedFather.getGenotypeString(),phasedFather.getDP(),Arrays.asList(phasedFather.getAD()),phasedFather.getLikelihoodsString(),
phasedChild.getGenotypeString(),phasedChild.getAttribute(VCFConstants.DEPTH_KEY),phasedChild.getAttribute("AD"),phasedChild.getLikelihoodsString()); phasedChild.getGenotypeString(),Arrays.asList(phasedChild.getDP()),phasedChild.getAD(),phasedChild.getLikelihoodsString());
if(!(phasedMother.getType()==mother.getType() && phasedFather.getType()==father.getType() && phasedChild.getType()==child.getType())) if(!(phasedMother.getType()==mother.getType() && phasedFather.getType()==father.getType() && phasedChild.getType()==child.getType()))
metricsCounters.put(NUM_GENOTYPES_MODIFIED,metricsCounters.get(NUM_GENOTYPES_MODIFIED)+1); metricsCounters.put(NUM_GENOTYPES_MODIFIED,metricsCounters.get(NUM_GENOTYPES_MODIFIED)+1);
} }
@ -809,8 +819,8 @@ public class PhaseByTransmission extends RodWalker<HashMap<Byte,Integer>, HashMa
metricsCounters.put(NUM_GENOTYPES_MODIFIED,metricsCounters.get(NUM_GENOTYPES_MODIFIED)+1); metricsCounters.put(NUM_GENOTYPES_MODIFIED,metricsCounters.get(NUM_GENOTYPES_MODIFIED)+1);
mvfLine = String.format("%s\t%d\t%s\t%s\t%s\t%s\t%s:%s:%s:%s\t.:.:.:.\t%s:%s:%s:%s", mvfLine = String.format("%s\t%d\t%s\t%s\t%s\t%s\t%s:%s:%s:%s\t.:.:.:.\t%s:%s:%s:%s",
vc.getChr(),vc.getStart(),vc.getFilters(),vc.getAttribute(VCFConstants.ALLELE_COUNT_KEY),sample.toString(), vc.getChr(),vc.getStart(),vc.getFilters(),vc.getAttribute(VCFConstants.ALLELE_COUNT_KEY),sample.toString(),
phasedMother.getAttribute(TRANSMISSION_PROBABILITY_TAG_NAME),phasedMother.getGenotypeString(),phasedMother.getAttribute(VCFConstants.DEPTH_KEY),phasedMother.getAttribute("AD"),phasedMother.getLikelihoodsString(), phasedMother.getExtendedAttribute(TRANSMISSION_PROBABILITY_TAG_NAME),phasedMother.getGenotypeString(),phasedMother.getDP(),Arrays.asList(phasedMother.getAD()),phasedMother.getLikelihoodsString(),
phasedChild.getGenotypeString(),phasedChild.getAttribute(VCFConstants.DEPTH_KEY),phasedChild.getAttribute("AD"),phasedChild.getLikelihoodsString()); phasedChild.getGenotypeString(),phasedChild.getDP(),Arrays.asList(phasedChild.getAD()),phasedChild.getLikelihoodsString());
} }
} }
else{ else{
@ -820,8 +830,8 @@ public class PhaseByTransmission extends RodWalker<HashMap<Byte,Integer>, HashMa
metricsCounters.put(NUM_GENOTYPES_MODIFIED,metricsCounters.get(NUM_GENOTYPES_MODIFIED)+1); metricsCounters.put(NUM_GENOTYPES_MODIFIED,metricsCounters.get(NUM_GENOTYPES_MODIFIED)+1);
mvfLine = String.format("%s\t%d\t%s\t%s\t%s\t%s\t.:.:.:.\t%s:%s:%s:%s\t%s:%s:%s:%s", mvfLine = String.format("%s\t%d\t%s\t%s\t%s\t%s\t.:.:.:.\t%s:%s:%s:%s\t%s:%s:%s:%s",
vc.getChr(),vc.getStart(),vc.getFilters(),vc.getAttribute(VCFConstants.ALLELE_COUNT_KEY),sample.toString(), vc.getChr(),vc.getStart(),vc.getFilters(),vc.getAttribute(VCFConstants.ALLELE_COUNT_KEY),sample.toString(),
phasedFather.getAttribute(TRANSMISSION_PROBABILITY_TAG_NAME),phasedFather.getGenotypeString(),phasedFather.getAttribute(VCFConstants.DEPTH_KEY),phasedFather.getAttribute("AD"),phasedFather.getLikelihoodsString(), phasedFather.getExtendedAttribute(TRANSMISSION_PROBABILITY_TAG_NAME),phasedFather.getGenotypeString(),phasedFather.getDP(),Arrays.asList(phasedFather.getAD()),phasedFather.getLikelihoodsString(),
phasedChild.getGenotypeString(),phasedChild.getAttribute(VCFConstants.DEPTH_KEY),phasedChild.getAttribute("AD"),phasedChild.getLikelihoodsString()); phasedChild.getGenotypeString(),phasedChild.getDP(),Arrays.asList(phasedChild.getAD()),phasedChild.getLikelihoodsString());
} }
//Report violation if set so //Report violation if set so

View File

@ -109,14 +109,13 @@ class PhasingUtils {
} }
double mergedGQ = Math.max(gt1.getLog10PError(), gt2.getLog10PError()); double mergedGQ = Math.max(gt1.getLog10PError(), gt2.getLog10PError());
Set<String> mergedGtFilters = new HashSet<String>(); // Since gt1 and gt2 were unfiltered, the Genotype remains unfiltered
Map<String, Object> mergedGtAttribs = new HashMap<String, Object>(); Map<String, Object> mergedGtAttribs = new HashMap<String, Object>();
PhaseAndQuality phaseQual = calcPhaseForMergedGenotypes(gt1, gt2); PhaseAndQuality phaseQual = calcPhaseForMergedGenotypes(gt1, gt2);
if (phaseQual.PQ != null) if (phaseQual.PQ != null)
mergedGtAttribs.put(ReadBackedPhasingWalker.PQ_KEY, phaseQual.PQ); mergedGtAttribs.put(ReadBackedPhasingWalker.PQ_KEY, phaseQual.PQ);
Genotype mergedGt = new Genotype(gt1.getSampleName(), mergedAllelesForSample, mergedGQ, mergedGtFilters, mergedGtAttribs, phaseQual.isPhased); Genotype mergedGt = new GenotypeBuilder(gt1.getSampleName(), mergedAllelesForSample).log10PError(mergedGQ).attributes(mergedGtAttribs).phased(phaseQual.isPhased).make();
mergedGenotypes.add(mergedGt); mergedGenotypes.add(mergedGt);
} }

View File

@ -269,10 +269,7 @@ public class ReadBackedPhasingWalker extends RodWalker<PhasingStatsAndOutput, Ph
logger.debug("Unprocessed variant = " + VariantContextUtils.getLocation(getToolkit().getGenomeLocParser(), vc)); logger.debug("Unprocessed variant = " + VariantContextUtils.getLocation(getToolkit().getGenomeLocParser(), vc));
} }
int numReads = 0; int numReads = context.getBasePileup().getNumberOfElements();
if (context.hasBasePileup()) {
numReads = context.getBasePileup().getNumberOfElements();
}
PhasingStats addInPhaseStats = new PhasingStats(numReads, 1); PhasingStats addInPhaseStats = new PhasingStats(numReads, 1);
phaseStats.addIn(addInPhaseStats); phaseStats.addIn(addInPhaseStats);
} }
@ -288,7 +285,7 @@ public class ReadBackedPhasingWalker extends RodWalker<PhasingStatsAndOutput, Ph
private VariantContext reduceVCToSamples(VariantContext vc, Set<String> samplesToPhase) { private VariantContext reduceVCToSamples(VariantContext vc, Set<String> samplesToPhase) {
// for ( String sample : samplesToPhase ) // for ( String sample : samplesToPhase )
// logger.debug(String.format(" Sample %s has genotype %s, het = %s", sample, vc.getGenotype(sample), vc.getGenotype(sample).isHet() )); // logger.debug(String.format(" Sample %s has genotype %s, het = %s", sample, vc.getGenotype(sample), vc.getGenotype(sample).isHet() ));
VariantContext subvc = vc.subContextFromSamples(samplesToPhase); VariantContext subvc = vc.subContextFromSamples(samplesToPhase, true);
// logger.debug("original VC = " + vc); // logger.debug("original VC = " + vc);
// logger.debug("sub VC = " + subvc); // logger.debug("sub VC = " + subvc);
return VariantContextUtils.pruneVariantContext(subvc, KEYS_TO_KEEP_IN_REDUCED_VCF); return VariantContextUtils.pruneVariantContext(subvc, KEYS_TO_KEEP_IN_REDUCED_VCF);
@ -374,7 +371,7 @@ public class ReadBackedPhasingWalker extends RodWalker<PhasingStatsAndOutput, Ph
if (isUnfilteredCalledDiploidGenotype(gt)) { if (isUnfilteredCalledDiploidGenotype(gt)) {
if (gt.isHom()) { // Note that this Genotype may be replaced later to contain the PQ of a downstream het site that was phased relative to a het site lying upstream of this hom site: if (gt.isHom()) { // Note that this Genotype may be replaced later to contain the PQ of a downstream het site that was phased relative to a het site lying upstream of this hom site:
// true <-> can trivially phase a hom site relative to ANY previous site: // true <-> can trivially phase a hom site relative to ANY previous site:
Genotype phasedGt = new Genotype(gt.getSampleName(), gt.getAlleles(), gt.getLog10PError(), gt.getFilters(), gt.getAttributes(), true); Genotype phasedGt = new GenotypeBuilder(gt).phased(true).make();
uvc.setGenotype(samp, phasedGt); uvc.setGenotype(samp, phasedGt);
} }
else if (gt.isHet()) { // Attempt to phase this het genotype relative to the previous het genotype else if (gt.isHet()) { // Attempt to phase this het genotype relative to the previous het genotype
@ -408,9 +405,10 @@ public class ReadBackedPhasingWalker extends RodWalker<PhasingStatsAndOutput, Ph
if (DEBUG) logger.debug("THE PHASE CHOSEN HERE:\n" + allelePair + "\n\n"); if (DEBUG) logger.debug("THE PHASE CHOSEN HERE:\n" + allelePair + "\n\n");
ensurePhasing(allelePair, prevAllelePair, pr.haplotype); ensurePhasing(allelePair, prevAllelePair, pr.haplotype);
Map<String, Object> gtAttribs = new HashMap<String, Object>(gt.getAttributes()); Genotype phasedGt = new GenotypeBuilder(gt)
gtAttribs.put(PQ_KEY, pr.phaseQuality); .alleles(allelePair.getAllelesAsList())
Genotype phasedGt = new Genotype(gt.getSampleName(), allelePair.getAllelesAsList(), gt.getLog10PError(), gt.getFilters(), gtAttribs, genotypesArePhased); .attribute(PQ_KEY, pr.phaseQuality)
.phased(genotypesArePhased).make();
uvc.setGenotype(samp, phasedGt); uvc.setGenotype(samp, phasedGt);
} }
@ -428,9 +426,9 @@ public class ReadBackedPhasingWalker extends RodWalker<PhasingStatsAndOutput, Ph
interiorUvc.setPhasingInconsistent(); interiorUvc.setPhasingInconsistent();
if (genotypesArePhased) { if (genotypesArePhased) {
Map<String, Object> handledGtAttribs = new HashMap<String, Object>(handledGt.getAttributes()); Genotype phasedHomGt = new GenotypeBuilder(handledGt)
handledGtAttribs.put(PQ_KEY, pr.phaseQuality); .attribute(PQ_KEY, pr.phaseQuality)
Genotype phasedHomGt = new Genotype(handledGt.getSampleName(), handledGt.getAlleles(), handledGt.getLog10PError(), handledGt.getFilters(), handledGtAttribs, genotypesArePhased); .phased(genotypesArePhased).make();
interiorUvc.setGenotype(samp, phasedHomGt); interiorUvc.setGenotype(samp, phasedHomGt);
} }
} }
@ -1106,10 +1104,7 @@ public class ReadBackedPhasingWalker extends RodWalker<PhasingStatsAndOutput, Ph
this.sampleReadBases = new HashMap<String, ReadBasesAtPosition>(); this.sampleReadBases = new HashMap<String, ReadBasesAtPosition>();
if (alignment != null) { if (alignment != null) {
ReadBackedPileup pileup = null; ReadBackedPileup pileup = alignment.getBasePileup();
if (alignment.hasBasePileup()) {
pileup = alignment.getBasePileup();
}
if (pileup != null) { if (pileup != null) {
// filter the read-base pileup based on min base and mapping qualities: // filter the read-base pileup based on min base and mapping qualities:
pileup = pileup.getBaseAndMappingFilteredPileup(MIN_BASE_QUALITY_SCORE, MIN_MAPPING_QUALITY_SCORE); pileup = pileup.getBaseAndMappingFilteredPileup(MIN_BASE_QUALITY_SCORE, MIN_MAPPING_QUALITY_SCORE);
@ -1439,7 +1434,7 @@ public class ReadBackedPhasingWalker extends RodWalker<PhasingStatsAndOutput, Ph
} }
public static boolean isUnfilteredCalledDiploidGenotype(Genotype gt) { public static boolean isUnfilteredCalledDiploidGenotype(Genotype gt) {
return (gt.isNotFiltered() && gt.isCalled() && gt.getPloidy() == 2); return (! gt.isFiltered() && gt.isCalled() && gt.getPloidy() == 2);
} }
private class MultipleBaseCountsWriter { private class MultipleBaseCountsWriter {

View File

@ -365,7 +365,7 @@ public class GenotypeAndValidateWalker extends RodWalker<GenotypeAndValidateWalk
return counter; return counter;
// Do not operate on variants that are not covered to the optional minimum depth // Do not operate on variants that are not covered to the optional minimum depth
if (!context.hasReads() || !context.hasBasePileup() || (minDepth > 0 && context.getBasePileup().getBases().length < minDepth)) { if (!context.hasReads() || (minDepth > 0 && context.getBasePileup().getBases().length < minDepth)) {
counter.nUncovered = 1L; counter.nUncovered = 1L;
if (vcComp.getAttribute("GV").equals("T")) if (vcComp.getAttribute("GV").equals("T"))
counter.nAltNotCalled = 1L; counter.nAltNotCalled = 1L;
@ -423,7 +423,7 @@ public class GenotypeAndValidateWalker extends RodWalker<GenotypeAndValidateWalk
} }
} }
else { else {
// if (!vcComp.hasAttribute("GV")) // if (!vcComp.hasExtendedAttribute("GV"))
// throw new UserException.BadInput("Variant has no GV annotation in the INFO field. " + vcComp.getChr() + ":" + vcComp.getStart()); // throw new UserException.BadInput("Variant has no GV annotation in the INFO field. " + vcComp.getChr() + ":" + vcComp.getStart());
if (call.isCalledAlt(callConf)) { if (call.isCalledAlt(callConf)) {

View File

@ -43,7 +43,7 @@ public class GLBasedSampleSelector extends SampleSelector {
return true; return true;
// want to include a site in the given samples if it is *likely* to be variant (via the EXACT model) // want to include a site in the given samples if it is *likely* to be variant (via the EXACT model)
// first subset to the samples // first subset to the samples
VariantContext subContext = vc.subContextFromSamples(samples); VariantContext subContext = vc.subContextFromSamples(samples, true);
// now check to see (using EXACT model) whether this should be variant // now check to see (using EXACT model) whether this should be variant
// do we want to apply a prior? maybe user-spec? // do we want to apply a prior? maybe user-spec?

View File

@ -45,7 +45,7 @@ public class GTBasedSampleSelector extends SampleSelector{
if ( samples == null || samples.isEmpty() ) if ( samples == null || samples.isEmpty() )
return true; return true;
VariantContext subContext = vc.subContextFromSamples(samples, vc.getAlleles()); VariantContext subContext = vc.subContextFromSamples(samples, false);
if ( subContext.isPolymorphicInSamples() ) { if ( subContext.isPolymorphicInSamples() ) {
return true; return true;
} }

View File

@ -8,6 +8,7 @@ import org.broadinstitute.sting.gatk.walkers.varianteval.util.Analysis;
import org.broadinstitute.sting.gatk.walkers.varianteval.util.Molten; import org.broadinstitute.sting.gatk.walkers.varianteval.util.Molten;
import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.variantcontext.Genotype; import org.broadinstitute.sting.utils.variantcontext.Genotype;
import org.broadinstitute.sting.utils.variantcontext.GenotypeType;
import org.broadinstitute.sting.utils.variantcontext.VariantContext; import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import java.util.*; import java.util.*;
@ -54,7 +55,7 @@ public class GenotypeConcordance extends VariantEvaluator {
* Initialize this object * Initialize this object
*/ */
public GenotypeConcordance() { public GenotypeConcordance() {
final int nGenotypeTypes = Genotype.Type.values().length; final int nGenotypeTypes = GenotypeType.values().length;
truthByCalledGenotypeCounts = new long[nGenotypeTypes][nGenotypeTypes]; truthByCalledGenotypeCounts = new long[nGenotypeTypes][nGenotypeTypes];
} }
@ -75,11 +76,11 @@ public class GenotypeConcordance extends VariantEvaluator {
if (eval != null) { if (eval != null) {
for (final Genotype g : eval.getGenotypes() ) { for (final Genotype g : eval.getGenotypes() ) {
final String sample = g.getSampleName(); final String sample = g.getSampleName();
final Genotype.Type called = g.getType(); final GenotypeType called = g.getType();
final Genotype.Type truth; final GenotypeType truth;
if (!validationIsValidVC || !validation.hasGenotype(sample)) { if (!validationIsValidVC || !validation.hasGenotype(sample)) {
truth = Genotype.Type.NO_CALL; truth = GenotypeType.NO_CALL;
} else { } else {
truth = validation.getGenotype(sample).getType(); truth = validation.getGenotype(sample).getType();
} }
@ -90,19 +91,19 @@ public class GenotypeConcordance extends VariantEvaluator {
// otherwise, mark no-calls for all samples // otherwise, mark no-calls for all samples
else { else {
final Genotype.Type called = Genotype.Type.NO_CALL; final GenotypeType called = GenotypeType.NO_CALL;
for (final Genotype g : validation.getGenotypes()) { for (final Genotype g : validation.getGenotypes()) {
final Genotype.Type truth = g.getType(); final GenotypeType truth = g.getType();
incrValue(truth, called); incrValue(truth, called);
// print out interesting sites // print out interesting sites
/* /*
if ( PRINT_INTERESTING_SITES && super.getVEWalker().gcLog != null ) { if ( PRINT_INTERESTING_SITES && super.getVEWalker().gcLog != null ) {
if ( (truth == Genotype.Type.HOM_VAR || truth == Genotype.Type.HET) && called == Genotype.Type.NO_CALL ) { if ( (truth == GenotypeType.HOM_VAR || truth == GenotypeType.HET) && called == GenotypeType.NO_CALL ) {
super.getVEWalker().gcLog.printf("%s FN %s%n", group, validation); super.getVEWalker().gcLog.printf("%s FN %s%n", group, validation);
} }
if ( (called == Genotype.Type.HOM_VAR || called == Genotype.Type.HET) && truth == Genotype.Type.HOM_REF ) { if ( (called == GenotypeType.HOM_VAR || called == GenotypeType.HET) && truth == GenotypeType.HOM_REF ) {
super.getVEWalker().gcLog.printf("%s FP %s%n", group, validation); super.getVEWalker().gcLog.printf("%s FP %s%n", group, validation);
} }
} }
@ -121,36 +122,36 @@ public class GenotypeConcordance extends VariantEvaluator {
* @param truth the truth type * @param truth the truth type
* @param called the called type * @param called the called type
*/ */
private void incrValue(final Genotype.Type truth, final Genotype.Type called) { private void incrValue(final GenotypeType truth, final GenotypeType called) {
truthByCalledGenotypeCounts[truth.ordinal()][called.ordinal()]++; truthByCalledGenotypeCounts[truth.ordinal()][called.ordinal()]++;
} }
private long count(final Genotype.Type truth, final Genotype.Type called) { private long count(final GenotypeType truth, final GenotypeType called) {
return truthByCalledGenotypeCounts[truth.ordinal()][called.ordinal()]; return truthByCalledGenotypeCounts[truth.ordinal()][called.ordinal()];
} }
private long count(final EnumSet<Genotype.Type> truth, final Genotype.Type called) { private long count(final EnumSet<GenotypeType> truth, final GenotypeType called) {
return count(truth, EnumSet.of(called)); return count(truth, EnumSet.of(called));
} }
private long count(final Genotype.Type truth, final EnumSet<Genotype.Type> called) { private long count(final GenotypeType truth, final EnumSet<GenotypeType> called) {
return count(EnumSet.of(truth), called); return count(EnumSet.of(truth), called);
} }
private long count(final EnumSet<Genotype.Type> truth, final EnumSet<Genotype.Type> called) { private long count(final EnumSet<GenotypeType> truth, final EnumSet<GenotypeType> called) {
long sum = 0; long sum = 0;
for ( final Genotype.Type truth1 : truth ) { for ( final GenotypeType truth1 : truth ) {
for ( final Genotype.Type called1 : called ) { for ( final GenotypeType called1 : called ) {
sum += count(truth1, called1); sum += count(truth1, called1);
} }
} }
return sum; return sum;
} }
private long countDiag( final EnumSet<Genotype.Type> d1 ) { private long countDiag( final EnumSet<GenotypeType> d1 ) {
long sum = 0; long sum = 0;
for(final Genotype.Type e1 : d1 ) { for(final GenotypeType e1 : d1 ) {
sum += truthByCalledGenotypeCounts[e1.ordinal()][e1.ordinal()]; sum += truthByCalledGenotypeCounts[e1.ordinal()][e1.ordinal()];
} }
@ -159,13 +160,13 @@ public class GenotypeConcordance extends VariantEvaluator {
@Override @Override
public void finalizeEvaluation() { public void finalizeEvaluation() {
final EnumSet<Genotype.Type> allVariantGenotypes = EnumSet.of(Genotype.Type.HOM_VAR, Genotype.Type.HET); final EnumSet<GenotypeType> allVariantGenotypes = EnumSet.of(GenotypeType.HOM_VAR, GenotypeType.HET);
final EnumSet<Genotype.Type> allCalledGenotypes = EnumSet.of(Genotype.Type.HOM_VAR, Genotype.Type.HET, Genotype.Type.HOM_REF); final EnumSet<GenotypeType> allCalledGenotypes = EnumSet.of(GenotypeType.HOM_VAR, GenotypeType.HET, GenotypeType.HOM_REF);
final EnumSet<Genotype.Type> allGenotypes = EnumSet.allOf(Genotype.Type.class); final EnumSet<GenotypeType> allGenotypes = EnumSet.allOf(GenotypeType.class);
// exact values of the table // exact values of the table
for ( final Genotype.Type truth : Genotype.Type.values() ) { for ( final GenotypeType truth : GenotypeType.values() ) {
for ( final Genotype.Type called : Genotype.Type.values() ) { for ( final GenotypeType called : GenotypeType.values() ) {
final String field = String.format("n_true_%s_called_%s", truth, called); final String field = String.format("n_true_%s_called_%s", truth, called);
final Long value = count(truth, called); final Long value = count(truth, called);
map.put(field, value.toString()); map.put(field, value.toString());
@ -173,20 +174,20 @@ public class GenotypeConcordance extends VariantEvaluator {
} }
// counts of called genotypes // counts of called genotypes
for ( final Genotype.Type called : Genotype.Type.values() ) { for ( final GenotypeType called : GenotypeType.values() ) {
final String field = String.format("total_called_%s", called); final String field = String.format("total_called_%s", called);
final Long value = count(allGenotypes, called); final Long value = count(allGenotypes, called);
map.put(field, value.toString()); map.put(field, value.toString());
} }
// counts of true genotypes // counts of true genotypes
for ( final Genotype.Type truth : Genotype.Type.values() ) { for ( final GenotypeType truth : GenotypeType.values() ) {
final String field = String.format("total_true_%s", truth); final String field = String.format("total_true_%s", truth);
final Long value = count(truth, allGenotypes); final Long value = count(truth, allGenotypes);
map.put(field, value.toString()); map.put(field, value.toString());
} }
for ( final Genotype.Type genotype : Genotype.Type.values() ) { for ( final GenotypeType genotype : GenotypeType.values() ) {
final String field = String.format("percent_%s_called_%s", genotype, genotype); final String field = String.format("percent_%s_called_%s", genotype, genotype);
long numer = count(genotype, genotype); long numer = count(genotype, genotype);
long denom = count(EnumSet.of(genotype), allGenotypes); long denom = count(EnumSet.of(genotype), allGenotypes);
@ -215,7 +216,7 @@ public class GenotypeConcordance extends VariantEvaluator {
// overall genotype concordance of sites called non-ref in eval track // overall genotype concordance of sites called non-ref in eval track
// MAD: this is the non-reference discrepancy rate // MAD: this is the non-reference discrepancy rate
final String field = "percent_non_reference_discrepancy_rate"; final String field = "percent_non_reference_discrepancy_rate";
long homrefConcords = count(Genotype.Type.HOM_REF, Genotype.Type.HOM_REF); long homrefConcords = count(GenotypeType.HOM_REF, GenotypeType.HOM_REF);
long allNoHomRef = count(allCalledGenotypes, allCalledGenotypes) - homrefConcords; long allNoHomRef = count(allCalledGenotypes, allCalledGenotypes) - homrefConcords;
long numer = allNoHomRef - countDiag(allVariantGenotypes); long numer = allNoHomRef - countDiag(allVariantGenotypes);
long denom = count(allCalledGenotypes, allCalledGenotypes) - homrefConcords; long denom = count(allCalledGenotypes, allCalledGenotypes) - homrefConcords;

View File

@ -121,9 +121,9 @@ public class ValidationReport extends VariantEvaluator implements StandardEval {
int ac = 0; int ac = 0;
if ( vc.getNAlleles() > 2 ) { if ( vc.getNAlleles() > 2 ) {
return SiteStatus.POLY; return SiteStatus.POLY;
//// System.out.printf("multiple alleles %s = %s%n", vc.getAlleles(), vc.getAttribute(VCFConstants.ALLELE_COUNT_KEY)); //// System.out.printf("multiple alleles %s = %s%n", vc.getAlleles(), vc.getExtendedAttribute(VCFConstants.ALLELE_COUNT_KEY));
// // todo -- omg this is painful. We need a better approach to dealing with multi-valued attributes // // todo -- omg this is painful. We need a better approach to dealing with multi-valued attributes
// for ( String v : (List<String>)vc.getAttribute(VCFConstants.ALLELE_COUNT_KEY) ) // for ( String v : (List<String>)vc.getExtendedAttribute(VCFConstants.ALLELE_COUNT_KEY) )
// ac += Integer.valueOf(v); // ac += Integer.valueOf(v);
//// System.out.printf(" ac = %d%n", ac); //// System.out.printf(" ac = %d%n", ac);
} }

View File

@ -241,7 +241,7 @@ public class VariantSummary extends VariantEvaluator implements StandardEval {
// update transition / transversion ratio // update transition / transversion ratio
if ( titvTable != null ) titvTable.inc(type, g.getSampleName()); if ( titvTable != null ) titvTable.inc(type, g.getSampleName());
if ( g.hasAttribute(VCFConstants.DEPTH_KEY) ) if ( g.hasDP() )
depthPerSample.inc(type, g.getSampleName()); depthPerSample.inc(type, g.getSampleName());
} }
} }

View File

@ -199,7 +199,7 @@ public class VariantEvalUtils {
* @return a new VariantContext with just the requested samples * @return a new VariantContext with just the requested samples
*/ */
public VariantContext getSubsetOfVariantContext(VariantContext vc, Set<String> sampleNames) { public VariantContext getSubsetOfVariantContext(VariantContext vc, Set<String> sampleNames) {
VariantContext vcsub = vc.subContextFromSamples(sampleNames, vc.getAlleles()); VariantContext vcsub = vc.subContextFromSamples(sampleNames, false);
VariantContextBuilder builder = new VariantContextBuilder(vcsub); VariantContextBuilder builder = new VariantContextBuilder(vcsub);
final int originalAlleleCount = vc.getHetCount() + 2 * vc.getHomVarCount(); final int originalAlleleCount = vc.getHetCount() + 2 * vc.getHomVarCount();

View File

@ -223,7 +223,7 @@ public class LeftAlignVariants extends RodWalker<Integer, Integer> {
newA = Allele.NO_CALL; newA = Allele.NO_CALL;
newAlleles.add(newA); newAlleles.add(newA);
} }
newGenotypes.add(Genotype.modifyAlleles(genotype, newAlleles)); newGenotypes.add(new GenotypeBuilder(genotype).alleles(newAlleles).make());
} }
return new VariantContextBuilder(vc).alleles(alleleMap.values()).genotypes(newGenotypes).referenceBaseForIndel(refBaseForIndel).make(); return new VariantContextBuilder(vc).alleles(alleleMap.values()).genotypes(newGenotypes).referenceBaseForIndel(refBaseForIndel).make();

View File

@ -315,6 +315,15 @@ public class SelectVariants extends RodWalker<Integer, Integer> implements TreeR
@Argument(fullName="fullyDecode", doc="If true, the incoming VariantContext will be fully decoded", required=false) @Argument(fullName="fullyDecode", doc="If true, the incoming VariantContext will be fully decoded", required=false)
private boolean fullyDecode = false; private boolean fullyDecode = false;
@Hidden
@Argument(fullName="forceGenotypesDecode", doc="If true, the incoming VariantContext will have its genotypes forcibly decoded by computing AC across all genotypes. For efficiency testing only", required=false)
private boolean forceGenotypesDecode = false;
@Hidden
@Argument(fullName="justRead", doc="If true, we won't actually write the output file. For efficiency testing only", required=false)
private boolean justRead = false;
/* Private class used to store the intermediate variants in the integer random selection process */ /* Private class used to store the intermediate variants in the integer random selection process */
private class RandomVariantStructure { private class RandomVariantStructure {
private VariantContext vc; private VariantContext vc;
@ -392,11 +401,13 @@ public class SelectVariants extends RodWalker<Integer, Integer> implements TreeR
Collection<String> XLsamplesFromFile = SampleUtils.getSamplesFromFiles(XLsampleFiles); Collection<String> XLsamplesFromFile = SampleUtils.getSamplesFromFiles(XLsampleFiles);
samples.removeAll(XLsamplesFromFile); samples.removeAll(XLsamplesFromFile);
samples.removeAll(XLsampleNames); samples.removeAll(XLsampleNames);
NO_SAMPLES_SPECIFIED = NO_SAMPLES_SPECIFIED && XLsampleNames.isEmpty();
if ( samples.size() == 0 && !NO_SAMPLES_SPECIFIED ) if ( samples.size() == 0 && !NO_SAMPLES_SPECIFIED )
throw new UserException("All samples requested to be included were also requested to be excluded."); throw new UserException("All samples requested to be included were also requested to be excluded.");
for ( String sample : samples ) if ( ! NO_SAMPLES_SPECIFIED )
for ( String sample : samples )
logger.info("Including sample '" + sample + "'"); logger.info("Including sample '" + sample + "'");
// if user specified types to include, add these, otherwise, add all possible variant context types to list of vc types to include // if user specified types to include, add these, otherwise, add all possible variant context types to list of vc types to include
@ -494,7 +505,16 @@ public class SelectVariants extends RodWalker<Integer, Integer> implements TreeR
} }
for (VariantContext vc : vcs) { for (VariantContext vc : vcs) {
if ( fullyDecode ) vc = vc.fullyDecode(vcfRods.get(vc.getSource())); // an option for performance testing only
if ( fullyDecode )
vc = vc.fullyDecode(vcfRods.get(vc.getSource()));
// an option for performance testing only
if ( forceGenotypesDecode ) {
final int x = vc.getCalledChrCount();
//logger.info("forceGenotypesDecode with getCalledChrCount() = " + );
}
if ( IDsToKeep != null && ! IDsToKeep.contains(vc.getID()) ) if ( IDsToKeep != null && ! IDsToKeep.contains(vc.getID()) )
continue; continue;
@ -538,7 +558,7 @@ public class SelectVariants extends RodWalker<Integer, Integer> implements TreeR
if (!selectedTypes.contains(vc.getType())) if (!selectedTypes.contains(vc.getType()))
continue; continue;
VariantContext sub = subsetRecord(vc, samples, EXCLUDE_NON_VARIANTS); VariantContext sub = subsetRecord(vc, EXCLUDE_NON_VARIANTS);
if ( REGENOTYPE && sub.isPolymorphicInSamples() && hasPLs(sub) ) { if ( REGENOTYPE && sub.isPolymorphicInSamples() && hasPLs(sub) ) {
final VariantContextBuilder builder = new VariantContextBuilder(UG_engine.calculateGenotypes(tracker, ref, context, sub)).filters(sub.getFiltersMaybeNull()); final VariantContextBuilder builder = new VariantContextBuilder(UG_engine.calculateGenotypes(tracker, ref, context, sub)).filters(sub.getFiltersMaybeNull());
@ -559,7 +579,8 @@ public class SelectVariants extends RodWalker<Integer, Integer> implements TreeR
randomlyAddVariant(++variantNumber, sub); randomlyAddVariant(++variantNumber, sub);
} }
else if (!SELECT_RANDOM_FRACTION || ( GenomeAnalysisEngine.getRandomGenerator().nextDouble() < fractionRandom)) { else if (!SELECT_RANDOM_FRACTION || ( GenomeAnalysisEngine.getRandomGenerator().nextDouble() < fractionRandom)) {
vcfWriter.add(sub); if ( ! justRead )
vcfWriter.add(sub);
} }
} }
} }
@ -687,18 +708,14 @@ public class SelectVariants extends RodWalker<Integer, Integer> implements TreeR
* Helper method to subset a VC record, modifying some metadata stored in the INFO field (i.e. AN, AC, AF). * Helper method to subset a VC record, modifying some metadata stored in the INFO field (i.e. AN, AC, AF).
* *
* @param vc the VariantContext record to subset * @param vc the VariantContext record to subset
* @param samples the samples to extract
* @return the subsetted VariantContext * @return the subsetted VariantContext
*/ */
private VariantContext subsetRecord(final VariantContext vc, final Set<String> samples, final boolean excludeNonVariants) { private VariantContext subsetRecord(final VariantContext vc, final boolean excludeNonVariants) {
if ( samples == null || samples.isEmpty() ) if ( NO_SAMPLES_SPECIFIED || samples.isEmpty() )
return vc; return vc;
final VariantContext sub; final VariantContext sub = vc.subContextFromSamples(samples, excludeNonVariants); // strip out the alternate alleles that aren't being used
if ( excludeNonVariants )
sub = vc.subContextFromSamples(samples); // strip out the alternate alleles that aren't being used
else
sub = vc.subContextFromSamples(samples, vc.getAlleles());
VariantContextBuilder builder = new VariantContextBuilder(sub); VariantContextBuilder builder = new VariantContextBuilder(sub);
GenotypesContext newGC = sub.getGenotypes(); GenotypesContext newGC = sub.getGenotypes();
@ -708,15 +725,13 @@ public class SelectVariants extends RodWalker<Integer, Integer> implements TreeR
newGC = VariantContextUtils.stripPLs(sub.getGenotypes()); newGC = VariantContextUtils.stripPLs(sub.getGenotypes());
//Remove a fraction of the genotypes if needed //Remove a fraction of the genotypes if needed
if(fractionGenotypes>0){ if ( fractionGenotypes > 0 ){
ArrayList<Genotype> genotypes = new ArrayList<Genotype>(); ArrayList<Genotype> genotypes = new ArrayList<Genotype>();
for ( Genotype genotype : newGC ) { for ( Genotype genotype : newGC ) {
//Set genotype to no call if it falls in the fraction. //Set genotype to no call if it falls in the fraction.
if(fractionGenotypes>0 && randomGenotypes.nextDouble()<fractionGenotypes){ if(fractionGenotypes>0 && randomGenotypes.nextDouble()<fractionGenotypes){
ArrayList<Allele> alleles = new ArrayList<Allele>(2); List<Allele> alleles = Arrays.asList(Allele.NO_CALL, Allele.NO_CALL);
alleles.add(Allele.create((byte)'.')); genotypes.add(new GenotypeBuilder(genotype).alleles(alleles).noGQ().make());
alleles.add(Allele.create((byte)'.'));
genotypes.add(new Genotype(genotype.getSampleName(),alleles, Genotype.NO_LOG10_PERROR,genotype.getFilters(),new HashMap<String, Object>(),false));
} }
else{ else{
genotypes.add(genotype); genotypes.add(genotype);
@ -750,14 +765,12 @@ public class SelectVariants extends RodWalker<Integer, Integer> implements TreeR
for (String sample : originalVC.getSampleNames()) { for (String sample : originalVC.getSampleNames()) {
Genotype g = originalVC.getGenotype(sample); Genotype g = originalVC.getGenotype(sample);
if ( g.isNotFiltered() ) { if ( ! g.isFiltered() ) {
if ( g.hasDP() )
String dp = (String) g.getAttribute("DP"); depth += g.getDP();
if (dp != null && ! dp.equals(VCFConstants.MISSING_DEPTH_v3) && ! dp.equals(VCFConstants.MISSING_VALUE_v4) ) {
depth += Integer.valueOf(dp);
}
} }
} }
builder.attribute("DP", depth); builder.attribute("DP", depth);
} }

View File

@ -288,8 +288,8 @@ public class VariantsToBinaryPed extends RodWalker<Integer,Integer> {
private byte getStandardEncoding(Genotype g, int offset) { private byte getStandardEncoding(Genotype g, int offset) {
byte b; byte b;
if ( g.hasAttribute(VCFConstants.GENOTYPE_QUALITY_KEY) && ((Integer) g.getAttribute(VCFConstants.GENOTYPE_QUALITY_KEY)) < minGenotypeQuality ) { if ( g.hasGQ() && g.getGQ() < minGenotypeQuality ) {
b = NO_CALL; b = NO_CALL;
} else if ( g.isHomRef() ) { } else if ( g.isHomRef() ) {
b = HOM_REF; b = HOM_REF;
} else if ( g.isHomVar() ) { } else if ( g.isHomVar() ) {
@ -305,7 +305,7 @@ public class VariantsToBinaryPed extends RodWalker<Integer,Integer> {
private byte getFlippedEncoding(Genotype g, int offset) { private byte getFlippedEncoding(Genotype g, int offset) {
byte b; byte b;
if ( g.hasAttribute(VCFConstants.GENOTYPE_QUALITY_KEY) && ((Integer) g.getAttribute(VCFConstants.GENOTYPE_QUALITY_KEY)) < minGenotypeQuality ) { if ( g.hasGQ() && g.getGQ() < minGenotypeQuality ) {
b = NO_CALL; b = NO_CALL;
} else if ( g.isHomRef() ) { } else if ( g.isHomRef() ) {
b = HOM_VAR; b = HOM_VAR;

View File

@ -26,6 +26,7 @@ package org.broadinstitute.sting.gatk.walkers.variantutils;
import org.broadinstitute.sting.commandline.*; import org.broadinstitute.sting.commandline.*;
import org.broadinstitute.sting.utils.SampleUtils; import org.broadinstitute.sting.utils.SampleUtils;
import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants;
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader; import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader;
import org.broadinstitute.sting.utils.codecs.vcf.VCFUtils; import org.broadinstitute.sting.utils.codecs.vcf.VCFUtils;
import org.broadinstitute.sting.utils.variantcontext.Allele; import org.broadinstitute.sting.utils.variantcontext.Allele;
@ -314,8 +315,12 @@ public class VariantsToTable extends RodWalker<Integer, Integer> {
if ( addGenotypeFields ) { if ( addGenotypeFields ) {
for ( final String sample : samples ) { for ( final String sample : samples ) {
for ( final String gf : genotypeFields ) { for ( final String gf : genotypeFields ) {
if ( vc.hasGenotype(sample) && vc.getGenotype(sample).hasAttribute(gf) ) if ( vc.hasGenotype(sample) && vc.getGenotype(sample).hasAnyAttribute(gf) ) {
addFieldValue(vc.getGenotype(sample).getAttribute(gf), records); if ( gf.equals(VCFConstants.GENOTYPE_KEY) )
addFieldValue(vc.getGenotype(sample).getGenotypeString(true), records);
else
addFieldValue(vc.getGenotype(sample).getAnyAttribute(gf), records);
}
else else
addFieldValue(MISSING_DATA, records); addFieldValue(MISSING_DATA, records);
} }

View File

@ -132,7 +132,7 @@ public class VariantsToVCF extends RodWalker<Integer, Integer> {
// set the appropriate sample name if necessary // set the appropriate sample name if necessary
if ( sampleName != null && vc.hasGenotypes() && vc.hasGenotype(variants.getName()) ) { if ( sampleName != null && vc.hasGenotypes() && vc.hasGenotype(variants.getName()) ) {
Genotype g = Genotype.modifyName(vc.getGenotype(variants.getName()), sampleName); Genotype g = new GenotypeBuilder(vc.getGenotype(variants.getName())).name(sampleName).make();
builder.genotypes(g); builder.genotypes(g);
} }

View File

@ -1,9 +1,5 @@
package org.broadinstitute.sting.utils; package org.broadinstitute.sting.utils;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import java.io.ByteArrayOutputStream;
import java.io.ObjectOutputStream;
import java.util.BitSet; import java.util.BitSet;
import java.util.HashMap; import java.util.HashMap;
import java.util.Map; import java.util.Map;
@ -16,10 +12,8 @@ import java.util.Map;
*/ */
public class BitSetUtils { public class BitSetUtils {
static final private int MAX_DNA_CONTEXT = 31; // the maximum context size (number of bases) permitted in the "long bitset" implementation of the DNA <=> BitSet conversion.
static final private byte NBITS_LONG_REPRESENTATION = 64; // the number of bits used in the long version to represent the bit set (necessary for the two's complement representation of negative numbers) static final private byte NBITS_LONG_REPRESENTATION = 64; // the number of bits used in the long version to represent the bit set (necessary for the two's complement representation of negative numbers)
static final private byte NBITS_SHORT_REPRESENTATION = 16; // the number of bits used in the short version to represent the bit set (necessary for the two's complement representation of negative numbers) static final private byte NBITS_SHORT_REPRESENTATION = 16; // the number of bits used in the short version to represent the bit set (necessary for the two's complement representation of negative numbers)
static final long[] combinationsPerLength = new long[MAX_DNA_CONTEXT + 1]; // keeps the memoized table with the number of combinations for each given DNA context length
/** /**
* Creates an long out of a bitset * Creates an long out of a bitset
@ -112,173 +106,4 @@ public class BitSetUtils {
} }
return bitSet; return bitSet;
} }
/**
* Converts a BitSet into the dna string representation.
*
* Warning: This conversion is limited to long precision, therefore the dna sequence cannot
* be longer than 31 bases. To increase this limit, use BigNumbers instead of long and create
* a bitSetFrom(BigNumber) method.
*
* We calculate the length of the resulting DNA sequence by looking at the sum(4^i) that exceeds the
* base_10 representation of the sequence. This is important for us to know how to bring the number
* to a quasi-canonical base_4 representation, and to fill in leading A's (since A's are represented
* as 0's and leading 0's are omitted).
*
* quasi-canonical because A is represented by a 0, therefore,
* instead of : 0, 1, 2, 3, 10, 11, 12, ...
* we have : 0, 1, 2, 3, 00, 01, 02, ...
*
* but we can correctly decode it because we know the final length.
*
* @param bitSet the bitset representation of the dna sequence
* @return the dna sequence represented by the bitset
*/
public static String dnaFrom(final BitSet bitSet) {
long number = longFrom(bitSet); // the base_10 representation of the bit set
if (number < 0)
throw new ReviewedStingException("dna conversion cannot handle negative numbers. Possible overflow?");
final int length = contextLengthFor(number); // the length of the context (the number of combinations is memoized, so costs zero to separate this into two method calls)
number -= combinationsFor(length - 1); // subtract the the number of combinations of the preceding context from the number to get to the quasi-canonical representation
StringBuilder dna = new StringBuilder();
while (number > 0) { // perform a simple base_10 to base_4 conversion (quasi-canonical)
byte base = (byte) (number % 4);
switch (base) {
case 0:
dna.append('A');
break;
case 1:
dna.append('C');
break;
case 2:
dna.append('G');
break;
case 3:
dna.append('T');
break;
}
number /= 4;
}
for (int j = dna.length(); j < length; j++)
dna.append('A'); // add leading A's as necessary (due to the "quasi" canonical status, see description above)
return dna.reverse().toString(); // make sure to reverse the string since we should have been pre-pending all along
}
/**
* Creates a BitSet representation of a given dna string.
*
* Warning: This conversion is limited to long precision, therefore the dna sequence cannot
* be longer than 31 bases. To increase this limit, use BigNumbers instead of long and create
* a bitSetFrom(BigNumber) method.
*
* The bit representation of a dna string is the simple:
* 0 A 4 AA 8 CA
* 1 C 5 AC ...
* 2 G 6 AG 1343 TTGGT
* 3 T 7 AT 1364 TTTTT
*
* To convert from dna to number, we convert the dna string to base10 and add all combinations that
* preceded the string (with smaller lengths).
*
* @param dna the dna sequence
* @return the bitset representing the dna sequence
*/
public static BitSet bitSetFrom(String dna) {
return bitSetFrom(dna.getBytes());
}
public static BitSet bitSetFrom(final byte[] dna) {
if (dna.length > MAX_DNA_CONTEXT)
throw new ReviewedStingException(String.format("DNA Length cannot be bigger than %d. dna: %s (%d)", MAX_DNA_CONTEXT, dna, dna.length));
final long preContext = combinationsFor(dna.length - 1); // the sum of all combinations that preceded the length of the dna string
long baseTen = 0; // the number in base_10 that we are going to use to generate the bit set
for (final byte base : dna) {
baseTen *= 4;
baseTen += BaseUtils.simpleBaseToBaseIndex(base);
}
return bitSetFrom(baseTen + preContext); // the number representing this DNA string is the base_10 representation plus all combinations that preceded this string length.
}
/**
* Calculates the number of bits necessary to represent a given number of elements
*
* @param numberOfElements the number of elements to represent (must be positive)
* @return the number of bits necessary to represent this many elements
*/
public static int numberOfBitsToRepresent(long numberOfElements) {
if (numberOfElements < 0)
throw new ReviewedStingException("Number of elements must be positive: " + numberOfElements);
if (numberOfElements == 1L)
return 1; // special case
int n = 0;
numberOfElements--;
while (numberOfElements > 0) {
numberOfElements = numberOfElements >> 1;
n++;
}
return n;
}
/**
* Calculates the length of the DNA context for a given base 10 number
*
* It is important to know the length given the base 10 number to calculate the number of combinations
* and to disambiguate the "quasi-canonical" state.
*
* This method also calculates the number of combinations as a by-product, but since it memoizes the
* results, a subsequent call to combinationsFor(length) is O(1).
*
* @param number the base 10 representation of the bitset
* @return the length of the DNA context represented by this number
*/
private static int contextLengthFor(long number) {
int length = 1; // the calculated length of the DNA sequence given the base_10 representation of its BitSet.
long combinations = combinationsFor(length); // the next context (we advance it so we know which one was preceding it).
while (combinations <= number) { // find the length of the dna string (length)
length++;
combinations = combinationsFor(length); // calculate the next context
}
return length;
}
/**
* The sum of all combinations of a context of a given length from length = 0 to length.
*
* Memoized implementation of sum(4^i) , where i=[0,length]
*
* @param length the length of the DNA context
* @return the sum of all combinations leading up to this context length.
*/
private static long combinationsFor(int length) {
if (length > MAX_DNA_CONTEXT)
throw new ReviewedStingException(String.format("Context cannot be longer than %d bases but requested %d.", MAX_DNA_CONTEXT, length));
// only calculate the number of combinations if the table hasn't already cached the value
if (length > 0 && combinationsPerLength[length] == 0) {
long combinations = 0L;
for (int i = 1; i <= length; i++)
combinations += (1L << 2 * i); // add all combinations with 4^i ( 4^i is the same as 2^(2*i) )
combinationsPerLength[length] = combinations;
}
return combinationsPerLength[length];
}
public static byte[] sizeOf(Object obj) throws java.io.IOException
{
ByteArrayOutputStream byteObject = new ByteArrayOutputStream();
ObjectOutputStream objectOutputStream = new ObjectOutputStream(byteObject);
objectOutputStream.writeObject(obj);
objectOutputStream.flush();
objectOutputStream.close();
byteObject.close();
return byteObject.toByteArray();
}
} }

View File

@ -2,6 +2,7 @@ package org.broadinstitute.sting.utils;
import org.broadinstitute.sting.gatk.samples.Sample; import org.broadinstitute.sting.gatk.samples.Sample;
import org.broadinstitute.sting.utils.variantcontext.Genotype; import org.broadinstitute.sting.utils.variantcontext.Genotype;
import org.broadinstitute.sting.utils.variantcontext.GenotypeType;
import org.broadinstitute.sting.utils.variantcontext.VariantContext; import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import java.util.*; import java.util.*;
@ -30,7 +31,7 @@ public class MendelianViolation {
private boolean allCalledOnly = true; private boolean allCalledOnly = true;
//Stores occurrences of inheritance //Stores occurrences of inheritance
private EnumMap<Genotype.Type, EnumMap<Genotype.Type,EnumMap<Genotype.Type,Integer>>> inheritance; private EnumMap<GenotypeType, EnumMap<GenotypeType,EnumMap<GenotypeType,Integer>>> inheritance;
private int violations_total=0; private int violations_total=0;
@ -74,119 +75,119 @@ public class MendelianViolation {
//Count of HomRef/HomRef/HomRef trios //Count of HomRef/HomRef/HomRef trios
public int getRefRefRef(){ public int getRefRefRef(){
return inheritance.get(Genotype.Type.HOM_REF).get(Genotype.Type.HOM_REF).get(Genotype.Type.HOM_REF); return inheritance.get(GenotypeType.HOM_REF).get(GenotypeType.HOM_REF).get(GenotypeType.HOM_REF);
} }
//Count of HomVar/HomVar/HomVar trios //Count of HomVar/HomVar/HomVar trios
public int getVarVarVar(){ public int getVarVarVar(){
return inheritance.get(Genotype.Type.HOM_VAR).get(Genotype.Type.HOM_VAR).get(Genotype.Type.HOM_VAR); return inheritance.get(GenotypeType.HOM_VAR).get(GenotypeType.HOM_VAR).get(GenotypeType.HOM_VAR);
} }
//Count of HomRef/HomVar/Het trios //Count of HomRef/HomVar/Het trios
public int getRefVarHet(){ public int getRefVarHet(){
return inheritance.get(Genotype.Type.HOM_REF).get(Genotype.Type.HOM_VAR).get(Genotype.Type.HET) + return inheritance.get(GenotypeType.HOM_REF).get(GenotypeType.HOM_VAR).get(GenotypeType.HET) +
inheritance.get(Genotype.Type.HOM_VAR).get(Genotype.Type.HOM_REF).get(Genotype.Type.HET); inheritance.get(GenotypeType.HOM_VAR).get(GenotypeType.HOM_REF).get(GenotypeType.HET);
} }
//Count of Het/Het/Het trios //Count of Het/Het/Het trios
public int getHetHetHet(){ public int getHetHetHet(){
return inheritance.get(Genotype.Type.HET).get(Genotype.Type.HET).get(Genotype.Type.HET); return inheritance.get(GenotypeType.HET).get(GenotypeType.HET).get(GenotypeType.HET);
} }
//Count of Het/Het/HomRef trios //Count of Het/Het/HomRef trios
public int getHetHetHomRef(){ public int getHetHetHomRef(){
return inheritance.get(Genotype.Type.HET).get(Genotype.Type.HET).get(Genotype.Type.HOM_REF); return inheritance.get(GenotypeType.HET).get(GenotypeType.HET).get(GenotypeType.HOM_REF);
} }
//Count of Het/Het/HomVar trios //Count of Het/Het/HomVar trios
public int getHetHetHomVar(){ public int getHetHetHomVar(){
return inheritance.get(Genotype.Type.HET).get(Genotype.Type.HET).get(Genotype.Type.HOM_VAR); return inheritance.get(GenotypeType.HET).get(GenotypeType.HET).get(GenotypeType.HOM_VAR);
} }
//Count of ref alleles inherited from Het/Het parents (no violation) //Count of ref alleles inherited from Het/Het parents (no violation)
public int getParentsHetHetInheritedRef(){ public int getParentsHetHetInheritedRef(){
return inheritance.get(Genotype.Type.HET).get(Genotype.Type.HET).get(Genotype.Type.HET) return inheritance.get(GenotypeType.HET).get(GenotypeType.HET).get(GenotypeType.HET)
+ 2*inheritance.get(Genotype.Type.HET).get(Genotype.Type.HET).get(Genotype.Type.HOM_REF); + 2*inheritance.get(GenotypeType.HET).get(GenotypeType.HET).get(GenotypeType.HOM_REF);
//return parentsHetHet_childRef; //return parentsHetHet_childRef;
} }
//Count of var alleles inherited from Het/Het parents (no violation) //Count of var alleles inherited from Het/Het parents (no violation)
public int getParentsHetHetInheritedVar(){ public int getParentsHetHetInheritedVar(){
return inheritance.get(Genotype.Type.HET).get(Genotype.Type.HET).get(Genotype.Type.HET) return inheritance.get(GenotypeType.HET).get(GenotypeType.HET).get(GenotypeType.HET)
+ 2*inheritance.get(Genotype.Type.HET).get(Genotype.Type.HET).get(Genotype.Type.HOM_VAR); + 2*inheritance.get(GenotypeType.HET).get(GenotypeType.HET).get(GenotypeType.HOM_VAR);
//return parentsHetHet_childVar; //return parentsHetHet_childVar;
} }
//Count of ref alleles inherited from HomRef/Het parents (no violation) //Count of ref alleles inherited from HomRef/Het parents (no violation)
public int getParentsRefHetInheritedRef(){ public int getParentsRefHetInheritedRef(){
return inheritance.get(Genotype.Type.HOM_REF).get(Genotype.Type.HET).get(Genotype.Type.HOM_REF) return inheritance.get(GenotypeType.HOM_REF).get(GenotypeType.HET).get(GenotypeType.HOM_REF)
+ inheritance.get(Genotype.Type.HET).get(Genotype.Type.HOM_REF).get(Genotype.Type.HOM_REF); + inheritance.get(GenotypeType.HET).get(GenotypeType.HOM_REF).get(GenotypeType.HOM_REF);
//return parentsHomRefHet_childRef; //return parentsHomRefHet_childRef;
} }
//Count of var alleles inherited from HomRef/Het parents (no violation) //Count of var alleles inherited from HomRef/Het parents (no violation)
public int getParentsRefHetInheritedVar(){ public int getParentsRefHetInheritedVar(){
return inheritance.get(Genotype.Type.HOM_REF).get(Genotype.Type.HET).get(Genotype.Type.HET) return inheritance.get(GenotypeType.HOM_REF).get(GenotypeType.HET).get(GenotypeType.HET)
+ inheritance.get(Genotype.Type.HET).get(Genotype.Type.HOM_REF).get(Genotype.Type.HET); + inheritance.get(GenotypeType.HET).get(GenotypeType.HOM_REF).get(GenotypeType.HET);
//return parentsHomRefHet_childVar; //return parentsHomRefHet_childVar;
} }
//Count of ref alleles inherited from HomVar/Het parents (no violation) //Count of ref alleles inherited from HomVar/Het parents (no violation)
public int getParentsVarHetInheritedRef(){ public int getParentsVarHetInheritedRef(){
return inheritance.get(Genotype.Type.HOM_VAR).get(Genotype.Type.HET).get(Genotype.Type.HET) return inheritance.get(GenotypeType.HOM_VAR).get(GenotypeType.HET).get(GenotypeType.HET)
+ inheritance.get(Genotype.Type.HET).get(Genotype.Type.HOM_VAR).get(Genotype.Type.HET); + inheritance.get(GenotypeType.HET).get(GenotypeType.HOM_VAR).get(GenotypeType.HET);
//return parentsHomVarHet_childRef; //return parentsHomVarHet_childRef;
} }
//Count of var alleles inherited from HomVar/Het parents (no violation) //Count of var alleles inherited from HomVar/Het parents (no violation)
public int getParentsVarHetInheritedVar(){ public int getParentsVarHetInheritedVar(){
return inheritance.get(Genotype.Type.HOM_VAR).get(Genotype.Type.HET).get(Genotype.Type.HOM_VAR) return inheritance.get(GenotypeType.HOM_VAR).get(GenotypeType.HET).get(GenotypeType.HOM_VAR)
+ inheritance.get(Genotype.Type.HET).get(Genotype.Type.HOM_VAR).get(Genotype.Type.HOM_VAR); + inheritance.get(GenotypeType.HET).get(GenotypeType.HOM_VAR).get(GenotypeType.HOM_VAR);
//return parentsHomVarHet_childVar; //return parentsHomVarHet_childVar;
} }
//Count of violations of the type HOM_REF/HOM_REF -> HOM_VAR //Count of violations of the type HOM_REF/HOM_REF -> HOM_VAR
public int getParentsRefRefChildVar(){ public int getParentsRefRefChildVar(){
return inheritance.get(Genotype.Type.HOM_REF).get(Genotype.Type.HOM_REF).get(Genotype.Type.HOM_VAR); return inheritance.get(GenotypeType.HOM_REF).get(GenotypeType.HOM_REF).get(GenotypeType.HOM_VAR);
} }
//Count of violations of the type HOM_REF/HOM_REF -> HET //Count of violations of the type HOM_REF/HOM_REF -> HET
public int getParentsRefRefChildHet(){ public int getParentsRefRefChildHet(){
return inheritance.get(Genotype.Type.HOM_REF).get(Genotype.Type.HOM_REF).get(Genotype.Type.HET); return inheritance.get(GenotypeType.HOM_REF).get(GenotypeType.HOM_REF).get(GenotypeType.HET);
} }
//Count of violations of the type HOM_REF/HET -> HOM_VAR //Count of violations of the type HOM_REF/HET -> HOM_VAR
public int getParentsRefHetChildVar(){ public int getParentsRefHetChildVar(){
return inheritance.get(Genotype.Type.HOM_REF).get(Genotype.Type.HET).get(Genotype.Type.HOM_VAR) return inheritance.get(GenotypeType.HOM_REF).get(GenotypeType.HET).get(GenotypeType.HOM_VAR)
+ inheritance.get(Genotype.Type.HET).get(Genotype.Type.HOM_REF).get(Genotype.Type.HOM_VAR); + inheritance.get(GenotypeType.HET).get(GenotypeType.HOM_REF).get(GenotypeType.HOM_VAR);
} }
//Count of violations of the type HOM_REF/HOM_VAR -> HOM_VAR //Count of violations of the type HOM_REF/HOM_VAR -> HOM_VAR
public int getParentsRefVarChildVar(){ public int getParentsRefVarChildVar(){
return inheritance.get(Genotype.Type.HOM_REF).get(Genotype.Type.HOM_VAR).get(Genotype.Type.HOM_VAR) return inheritance.get(GenotypeType.HOM_REF).get(GenotypeType.HOM_VAR).get(GenotypeType.HOM_VAR)
+ inheritance.get(Genotype.Type.HOM_VAR).get(Genotype.Type.HOM_REF).get(Genotype.Type.HOM_VAR); + inheritance.get(GenotypeType.HOM_VAR).get(GenotypeType.HOM_REF).get(GenotypeType.HOM_VAR);
} }
//Count of violations of the type HOM_REF/HOM_VAR -> HOM_REF //Count of violations of the type HOM_REF/HOM_VAR -> HOM_REF
public int getParentsRefVarChildRef(){ public int getParentsRefVarChildRef(){
return inheritance.get(Genotype.Type.HOM_REF).get(Genotype.Type.HOM_VAR).get(Genotype.Type.HOM_REF) return inheritance.get(GenotypeType.HOM_REF).get(GenotypeType.HOM_VAR).get(GenotypeType.HOM_REF)
+ inheritance.get(Genotype.Type.HOM_VAR).get(Genotype.Type.HOM_REF).get(Genotype.Type.HOM_REF); + inheritance.get(GenotypeType.HOM_VAR).get(GenotypeType.HOM_REF).get(GenotypeType.HOM_REF);
} }
//Count of violations of the type HOM_VAR/HET -> HOM_REF //Count of violations of the type HOM_VAR/HET -> HOM_REF
public int getParentsVarHetChildRef(){ public int getParentsVarHetChildRef(){
return inheritance.get(Genotype.Type.HET).get(Genotype.Type.HOM_VAR).get(Genotype.Type.HOM_REF) return inheritance.get(GenotypeType.HET).get(GenotypeType.HOM_VAR).get(GenotypeType.HOM_REF)
+ inheritance.get(Genotype.Type.HOM_VAR).get(Genotype.Type.HET).get(Genotype.Type.HOM_REF); + inheritance.get(GenotypeType.HOM_VAR).get(GenotypeType.HET).get(GenotypeType.HOM_REF);
} }
//Count of violations of the type HOM_VAR/HOM_VAR -> HOM_REF //Count of violations of the type HOM_VAR/HOM_VAR -> HOM_REF
public int getParentsVarVarChildRef(){ public int getParentsVarVarChildRef(){
return inheritance.get(Genotype.Type.HOM_VAR).get(Genotype.Type.HOM_VAR).get(Genotype.Type.HOM_REF); return inheritance.get(GenotypeType.HOM_VAR).get(GenotypeType.HOM_VAR).get(GenotypeType.HOM_REF);
} }
//Count of violations of the type HOM_VAR/HOM_VAR -> HET //Count of violations of the type HOM_VAR/HOM_VAR -> HET
public int getParentsVarVarChildHet(){ public int getParentsVarVarChildHet(){
return inheritance.get(Genotype.Type.HOM_VAR).get(Genotype.Type.HOM_VAR).get(Genotype.Type.HET); return inheritance.get(GenotypeType.HOM_VAR).get(GenotypeType.HOM_VAR).get(GenotypeType.HET);
} }
@ -362,12 +363,12 @@ public class MendelianViolation {
private void createInheritanceMap(){ private void createInheritanceMap(){
inheritance = new EnumMap<Genotype.Type,EnumMap<Genotype.Type,EnumMap<Genotype.Type,Integer>>>(Genotype.Type.class); inheritance = new EnumMap<GenotypeType,EnumMap<GenotypeType,EnumMap<GenotypeType,Integer>>>(GenotypeType.class);
for(Genotype.Type mType : Genotype.Type.values()){ for(GenotypeType mType : GenotypeType.values()){
inheritance.put(mType, new EnumMap<Genotype.Type,EnumMap<Genotype.Type,Integer>>(Genotype.Type.class)); inheritance.put(mType, new EnumMap<GenotypeType,EnumMap<GenotypeType,Integer>>(GenotypeType.class));
for(Genotype.Type dType : Genotype.Type.values()){ for(GenotypeType dType : GenotypeType.values()){
inheritance.get(mType).put(dType, new EnumMap<Genotype.Type,Integer>(Genotype.Type.class)); inheritance.get(mType).put(dType, new EnumMap<GenotypeType,Integer>(GenotypeType.class));
for(Genotype.Type cType : Genotype.Type.values()){ for(GenotypeType cType : GenotypeType.values()){
inheritance.get(mType).get(dType).put(cType, 0); inheritance.get(mType).get(dType).put(cType, 0);
} }
} }
@ -376,9 +377,9 @@ public class MendelianViolation {
} }
private void clearInheritanceMap(){ private void clearInheritanceMap(){
for(Genotype.Type mType : Genotype.Type.values()){ for(GenotypeType mType : GenotypeType.values()){
for(Genotype.Type dType : Genotype.Type.values()){ for(GenotypeType dType : GenotypeType.values()){
for(Genotype.Type cType : Genotype.Type.values()){ for(GenotypeType cType : GenotypeType.values()){
inheritance.get(mType).get(dType).put(cType, 0); inheritance.get(mType).get(dType).put(cType, 0);
} }
} }

View File

@ -225,9 +225,9 @@ public class SequenceDictionaryUtils {
return false; return false;
// todo -- reenable if we want to be really strict here // todo -- reenable if we want to be really strict here
// if (me.getAttribute(SAMSequenceRecord.MD5_TAG) != null && that.getAttribute(SAMSequenceRecord.MD5_TAG) != null) { // if (me.getExtendedAttribute(SAMSequenceRecord.MD5_TAG) != null && that.getExtendedAttribute(SAMSequenceRecord.MD5_TAG) != null) {
// final BigInteger thisMd5 = new BigInteger((String)me.getAttribute(SAMSequenceRecord.MD5_TAG), 16); // final BigInteger thisMd5 = new BigInteger((String)me.getExtendedAttribute(SAMSequenceRecord.MD5_TAG), 16);
// final BigInteger thatMd5 = new BigInteger((String)that.getAttribute(SAMSequenceRecord.MD5_TAG), 16); // final BigInteger thatMd5 = new BigInteger((String)that.getExtendedAttribute(SAMSequenceRecord.MD5_TAG), 16);
// if (!thisMd5.equals(thatMd5)) { // if (!thisMd5.equals(thatMd5)) {
// return false; // return false;
// } // }

View File

@ -223,6 +223,20 @@ public class Utils {
return ret.toString(); return ret.toString();
} }
public static String join(String separator, int[] ints) {
if ( ints == null || ints.length == 0)
return "";
else {
StringBuilder ret = new StringBuilder();
ret.append(ints[0]);
for (int i = 1; i < ints.length; ++i) {
ret.append(separator);
ret.append(ints[i]);
}
return ret.toString();
}
}
/** /**
* Returns a string of the form elt1.toString() [sep elt2.toString() ... sep elt.toString()] for a collection of * Returns a string of the form elt1.toString() [sep elt2.toString() ... sep elt.toString()] for a collection of
* elti objects (note there's no actual space between sep and the elti elements). Returns * elti objects (note there's no actual space between sep and the elti elements). Returns

View File

@ -24,6 +24,8 @@
package org.broadinstitute.sting.utils.codecs.bcf2; package org.broadinstitute.sting.utils.codecs.bcf2;
import com.google.java.contract.Ensures;
import com.google.java.contract.Requires;
import net.sf.samtools.SAMSequenceRecord; import net.sf.samtools.SAMSequenceRecord;
import org.apache.log4j.Logger; import org.apache.log4j.Logger;
import org.broad.tribble.Feature; import org.broad.tribble.Feature;
@ -33,9 +35,7 @@ import org.broad.tribble.readers.AsciiLineReader;
import org.broad.tribble.readers.PositionalBufferedStream; import org.broad.tribble.readers.PositionalBufferedStream;
import org.broadinstitute.sting.gatk.refdata.ReferenceDependentFeatureCodec; import org.broadinstitute.sting.gatk.refdata.ReferenceDependentFeatureCodec;
import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.codecs.vcf.*; import org.broadinstitute.sting.utils.codecs.vcf.*;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.variantcontext.*; import org.broadinstitute.sting.utils.variantcontext.*;
@ -45,15 +45,45 @@ import java.io.FileNotFoundException;
import java.io.IOException; import java.io.IOException;
import java.util.*; import java.util.*;
public class BCF2Codec implements FeatureCodec<VariantContext>, ReferenceDependentFeatureCodec { /**
* Decode BCF2 files
*/
public final class BCF2Codec implements FeatureCodec<VariantContext>, ReferenceDependentFeatureCodec {
final protected static Logger logger = Logger.getLogger(BCF2Codec.class); final protected static Logger logger = Logger.getLogger(BCF2Codec.class);
private VCFHeader header = null; private VCFHeader header = null;
/**
* Maps offsets (encoded in BCF) into contig names (from header) for the CHROM field
*/
private final ArrayList<String> contigNames = new ArrayList<String>(); private final ArrayList<String> contigNames = new ArrayList<String>();
/**
* Maps header string names (encoded in VCF) into strings found in the BCF header
*
* Initialized when processing the header
*/
private ArrayList<String> dictionary; private ArrayList<String> dictionary;
/**
* Our decoder that reads low-level objects from the BCF2 records
*/
private final BCF2Decoder decoder = new BCF2Decoder(); private final BCF2Decoder decoder = new BCF2Decoder();
private boolean skipGenotypes = false;
/**
* Provides some sanity checking on the header
*/
private final static int MAX_HEADER_SIZE = 0x08000000; private final static int MAX_HEADER_SIZE = 0x08000000;
/**
* Genotype field decoders that are initialized when the header is read
*/
private BCF2GenotypeFieldDecoders gtFieldDecoders = null;
// for error handling
private int recordNo = 0;
private int pos = 0;
// ---------------------------------------------------------------------- // ----------------------------------------------------------------------
// //
// Feature codec interface functions // Feature codec interface functions
@ -62,28 +92,30 @@ public class BCF2Codec implements FeatureCodec<VariantContext>, ReferenceDepende
@Override @Override
public Feature decodeLoc( final PositionalBufferedStream inputStream ) { public Feature decodeLoc( final PositionalBufferedStream inputStream ) {
return decode(inputStream); recordNo++;
// TODO: a less expensive version of decodeLoc() that doesn't use VariantContext final VariantContextBuilder builder = new VariantContextBuilder();
// TODO: very easy -- just decodeSitesBlock, and then skip to end of end of sites block
// TODO: and then skip genotypes block final int sitesBlockSize = decoder.readBlockSize(inputStream);
final int genotypeBlockSize = decoder.readBlockSize(inputStream); // necessary because it's in the stream
decoder.readNextBlock(sitesBlockSize, inputStream);
decodeSiteLoc(builder);
return builder.fullyDecoded(true).make();
} }
@Override @Override
public VariantContext decode( final PositionalBufferedStream inputStream ) { public VariantContext decode( final PositionalBufferedStream inputStream ) {
recordNo++;
final VariantContextBuilder builder = new VariantContextBuilder(); final VariantContextBuilder builder = new VariantContextBuilder();
final int sitesBlockSize = decoder.readBlockSize(inputStream); final int sitesBlockSize = decoder.readBlockSize(inputStream);
final int genotypeBlockSize = decoder.readBlockSize(inputStream); final int genotypeBlockSize = decoder.readBlockSize(inputStream);
decoder.readNextBlock(sitesBlockSize, inputStream); decoder.readNextBlock(sitesBlockSize, inputStream);
final SitesInfoForDecoding info = decodeSitesBlock(builder); decodeSiteLoc(builder);
final SitesInfoForDecoding info = decodeSitesExtendedInfo(builder);
if ( isSkippingGenotypes() ) {
decoder.skipNextBlock(genotypeBlockSize, inputStream);
} else {
decoder.readNextBlock(genotypeBlockSize, inputStream);
decodeGenotypes(info, builder);
}
decoder.readNextBlock(genotypeBlockSize, inputStream);
createLazyGenotypesDecoder(info, builder);
return builder.fullyDecoded(true).make(); return builder.fullyDecoded(true).make();
} }
@ -97,16 +129,16 @@ public class BCF2Codec implements FeatureCodec<VariantContext>, ReferenceDepende
try { try {
// note that this reads the magic as well, and so does double duty // note that this reads the magic as well, and so does double duty
if ( ! BCF2Utils.startsWithBCF2Magic(inputStream) ) if ( ! BCF2Utils.startsWithBCF2Magic(inputStream) )
throw new UserException.MalformedBCF2("Input stream does not begin with BCF2 magic"); error("Input stream does not begin with BCF2 magic");
final int headerSizeInBytes = BCF2Utils.readInt(BCF2Type.INT32.getSizeInBytes(), inputStream); final int headerSizeInBytes = BCF2Utils.readInt(BCF2Type.INT32.getSizeInBytes(), inputStream);
if ( headerSizeInBytes <= 0 || headerSizeInBytes > MAX_HEADER_SIZE) // no bigger than 8 MB if ( headerSizeInBytes <= 0 || headerSizeInBytes > MAX_HEADER_SIZE) // no bigger than 8 MB
throw new UserException.MalformedBCF2("BCF2 header has invalid length: " + headerSizeInBytes + " must be >= 0 and < "+ MAX_HEADER_SIZE); error("BCF2 header has invalid length: " + headerSizeInBytes + " must be >= 0 and < "+ MAX_HEADER_SIZE);
final byte[] headerBytes = new byte[headerSizeInBytes]; final byte[] headerBytes = new byte[headerSizeInBytes];
if ( inputStream.read(headerBytes) != headerSizeInBytes ) if ( inputStream.read(headerBytes) != headerSizeInBytes )
throw new UserException.MalformedBCF2("Couldn't read all of the bytes specified in the header length = " + headerSizeInBytes); error("Couldn't read all of the bytes specified in the header length = " + headerSizeInBytes);
final PositionalBufferedStream bps = new PositionalBufferedStream(new ByteArrayInputStream(headerBytes)); final PositionalBufferedStream bps = new PositionalBufferedStream(new ByteArrayInputStream(headerBytes));
final AsciiLineReader headerReader = new AsciiLineReader(bps); final AsciiLineReader headerReader = new AsciiLineReader(bps);
@ -118,12 +150,24 @@ public class BCF2Codec implements FeatureCodec<VariantContext>, ReferenceDepende
} }
// create the config offsets // create the config offsets
for ( final VCFContigHeaderLine contig : header.getContigLines()) if ( ! header.getContigLines().isEmpty() ) {
contigNames.add(contig.getID()); logger.info("Found contig lines in BCF2 file, using those");
contigNames.clear();
for ( final VCFContigHeaderLine contig : header.getContigLines()) {
if ( contig.getID() == null || contig.getID().equals("") )
error("found a contig with an invalid ID " + contig);
contigNames.add(contig.getID());
}
} else {
logger.info("Didn't find any contig lines in BCF2 file, falling back (dangerously) to GATK reference dictionary");
}
// create the string dictionary // create the string dictionary
dictionary = parseDictionary(header); dictionary = parseDictionary(header);
// prepare the genotype field decoders
gtFieldDecoders = new BCF2GenotypeFieldDecoders(header);
// position right before next line (would be right before first real record byte at end of header) // position right before next line (would be right before first real record byte at end of header)
return new FeatureCodecHeader(header, inputStream.getPosition()); return new FeatureCodecHeader(header, inputStream.getPosition());
} }
@ -153,7 +197,6 @@ public class BCF2Codec implements FeatureCodec<VariantContext>, ReferenceDepende
// //
// -------------------------------------------------------------------------------- // --------------------------------------------------------------------------------
@Override @Override
public void setGenomeLocParser(final GenomeLocParser genomeLocParser) { public void setGenomeLocParser(final GenomeLocParser genomeLocParser) {
// initialize contigNames to standard ones in reference // initialize contigNames to standard ones in reference
@ -161,14 +204,6 @@ public class BCF2Codec implements FeatureCodec<VariantContext>, ReferenceDepende
contigNames.add(contig.getSequenceName()); contigNames.add(contig.getSequenceName());
} }
public boolean isSkippingGenotypes() {
return skipGenotypes;
}
public void setSkipGenotypes(final boolean skipGenotypes) {
this.skipGenotypes = skipGenotypes;
}
// -------------------------------------------------------------------------------- // --------------------------------------------------------------------------------
// //
// implicit block // implicit block
@ -182,50 +217,83 @@ public class BCF2Codec implements FeatureCodec<VariantContext>, ReferenceDepende
// //
// -------------------------------------------------------------------------------- // --------------------------------------------------------------------------------
private final SitesInfoForDecoding decodeSitesBlock(final VariantContextBuilder builder) { /**
final int contigOffset = decoder.decodeInt(BCF2Type.INT32.getSizeInBytes()); * Decode the sites level data from this classes decoder
*
* @param builder
* @return
*/
@Requires({"builder != null"})
private final void decodeSiteLoc(final VariantContextBuilder builder) {
final int contigOffset = decoder.decodeInt(BCF2Type.INT32);
final String contig = lookupContigName(contigOffset); final String contig = lookupContigName(contigOffset);
builder.chr(contig); builder.chr(contig);
final int pos = decoder.decodeInt(BCF2Type.INT32.getSizeInBytes()); this.pos = decoder.decodeInt(BCF2Type.INT32);
final int refLength = decoder.decodeInt(BCF2Type.INT32.getSizeInBytes()); final int refLength = decoder.decodeInt(BCF2Type.INT32);
builder.start((long)pos); builder.start((long)pos);
builder.stop((long)(pos + refLength - 1)); // minus one because of our open intervals builder.stop((long)(pos + refLength - 1)); // minus one because of our open intervals
}
/**
* Decode the sites level data from this classes decoder
*
* @param builder
* @return
*/
@Requires({"builder != null", "decoder != null"})
@Ensures({"result != null", "result.isValid()"})
private final SitesInfoForDecoding decodeSitesExtendedInfo(final VariantContextBuilder builder) {
final Object qual = decoder.decodeSingleValue(BCF2Type.FLOAT); final Object qual = decoder.decodeSingleValue(BCF2Type.FLOAT);
if ( qual != null ) { if ( qual != null ) {
builder.log10PError(((Double)qual) / -10.0); builder.log10PError(((Double)qual) / -10.0);
} }
final int nAlleleInfo = decoder.decodeInt(BCF2Type.INT32.getSizeInBytes()); final int nAlleleInfo = decoder.decodeInt(BCF2Type.INT32);
final int nFormatSamples = decoder.decodeInt(BCF2Type.INT32.getSizeInBytes()); final int nFormatSamples = decoder.decodeInt(BCF2Type.INT32);
final int nAlleles = nAlleleInfo >> 16; final int nAlleles = nAlleleInfo >> 16;
final int nInfo = nAlleleInfo & 0x00FF; final int nInfo = nAlleleInfo & 0x0000FFFF;
final int nFormatFields = nFormatSamples >> 24; final int nFormatFields = nFormatSamples >> 24;
final int nSamples = nFormatSamples & 0x0FFF; final int nSamples = nFormatSamples & 0x00FFFFF;
decodeID(builder); decodeID(builder);
final ArrayList<Allele> alleles = decodeAlleles(builder, pos, nAlleles); final ArrayList<Allele> alleles = decodeAlleles(builder, pos, nAlleles);
decodeFilter(builder); decodeFilter(builder);
decodeInfo(builder, nInfo); decodeInfo(builder, nInfo);
return new SitesInfoForDecoding(pos, nFormatFields, nSamples, alleles); final SitesInfoForDecoding info = new SitesInfoForDecoding(nFormatFields, nSamples, alleles);
if ( ! info.isValid() )
error("Sites info is malformed: " + info);
return info;
} }
private final static class SitesInfoForDecoding { protected final static class SitesInfoForDecoding {
final int pos;
final int nFormatFields; final int nFormatFields;
final int nSamples; final int nSamples;
final ArrayList<Allele> alleles; final ArrayList<Allele> alleles;
private SitesInfoForDecoding(final int pos, final int nFormatFields, final int nSamples, final ArrayList<Allele> alleles) { private SitesInfoForDecoding(final int nFormatFields, final int nSamples, final ArrayList<Allele> alleles) {
this.pos = pos;
this.nFormatFields = nFormatFields; this.nFormatFields = nFormatFields;
this.nSamples = nSamples; this.nSamples = nSamples;
this.alleles = alleles; this.alleles = alleles;
} }
public boolean isValid() {
return nFormatFields >= 0 &&
nSamples >= 0 &&
alleles != null && ! alleles.isEmpty() && alleles.get(0).isReference();
}
@Override
public String toString() {
return String.format("nFormatFields = %d, nSamples = %d, alleles = %s", nFormatFields, nSamples, alleles);
}
} }
/**
* Decode the id field in this BCF2 file and store it in the builder
* @param builder
*/
private void decodeID( final VariantContextBuilder builder ) { private void decodeID( final VariantContextBuilder builder ) {
final String id = (String)decoder.decodeTypedValue(); final String id = (String)decoder.decodeTypedValue();
@ -235,6 +303,15 @@ public class BCF2Codec implements FeatureCodec<VariantContext>, ReferenceDepende
builder.id(id); builder.id(id);
} }
/**
* Annoying routine that deals with allele clipping from the BCF2 encoding to the standard
* GATK encoding.
*
* @param position
* @param ref
* @param unclippedAlleles
* @return
*/
protected static ArrayList<Allele> clipAllelesIfNecessary(int position, String ref, ArrayList<Allele> unclippedAlleles) { protected static ArrayList<Allele> clipAllelesIfNecessary(int position, String ref, ArrayList<Allele> unclippedAlleles) {
if ( ! AbstractVCFCodec.isSingleNucleotideEvent(unclippedAlleles) ) { if ( ! AbstractVCFCodec.isSingleNucleotideEvent(unclippedAlleles) ) {
ArrayList<Allele> clippedAlleles = new ArrayList<Allele>(unclippedAlleles.size()); ArrayList<Allele> clippedAlleles = new ArrayList<Allele>(unclippedAlleles.size());
@ -244,6 +321,14 @@ public class BCF2Codec implements FeatureCodec<VariantContext>, ReferenceDepende
return unclippedAlleles; return unclippedAlleles;
} }
/**
* Decode the alleles from this BCF2 file and put the results in builder
* @param builder
* @param pos
* @param nAlleles
* @return the alleles
*/
@Requires("nAlleles > 0")
private ArrayList<Allele> decodeAlleles( final VariantContextBuilder builder, final int pos, final int nAlleles ) { private ArrayList<Allele> decodeAlleles( final VariantContextBuilder builder, final int pos, final int nAlleles ) {
// TODO -- probably need inline decoder for efficiency here (no sense in going bytes -> string -> vector -> bytes // TODO -- probably need inline decoder for efficiency here (no sense in going bytes -> string -> vector -> bytes
ArrayList<Allele> alleles = new ArrayList<Allele>(nAlleles); ArrayList<Allele> alleles = new ArrayList<Allele>(nAlleles);
@ -259,15 +344,21 @@ public class BCF2Codec implements FeatureCodec<VariantContext>, ReferenceDepende
alleles.add(Allele.create(allele, false)); alleles.add(Allele.create(allele, false));
} }
} }
assert ref != null;
alleles = clipAllelesIfNecessary(pos, ref, alleles); alleles = clipAllelesIfNecessary(pos, ref, alleles);
builder.alleles(alleles); builder.alleles(alleles);
assert ref.length() > 0;
builder.referenceBaseForIndel(ref.getBytes()[0]); builder.referenceBaseForIndel(ref.getBytes()[0]);
return alleles; return alleles;
} }
/**
* Decode the filter field of this BCF2 file and store the result in the builder
* @param builder
*/
private void decodeFilter( final VariantContextBuilder builder ) { private void decodeFilter( final VariantContextBuilder builder ) {
final Object value = decoder.decodeTypedValue(); final Object value = decoder.decodeTypedValue();
@ -275,17 +366,28 @@ public class BCF2Codec implements FeatureCodec<VariantContext>, ReferenceDepende
builder.unfiltered(); builder.unfiltered();
else { else {
if ( value instanceof Integer ) if ( value instanceof Integer )
// fast path for single integer result
builder.filter(getDictionaryString((Integer)value)); builder.filter(getDictionaryString((Integer)value));
else { else {
for ( int offset : (List<Integer>)value ) for ( final int offset : (List<Integer>)value )
builder.filter(getDictionaryString(offset)); builder.filter(getDictionaryString(offset));
} }
} }
} }
/**
* Loop over the info field key / value pairs in this BCF2 file and decode them into the builder
*
* @param builder
* @param numInfoFields
*/
@Requires("numInfoFields >= 0")
private void decodeInfo( final VariantContextBuilder builder, final int numInfoFields ) { private void decodeInfo( final VariantContextBuilder builder, final int numInfoFields ) {
final Map<String, Object> infoFieldEntries = new HashMap<String, Object>(numInfoFields); if ( numInfoFields == 0 )
// fast path, don't bother doing any work if there are no fields
return;
final Map<String, Object> infoFieldEntries = new HashMap<String, Object>(numInfoFields);
for ( int i = 0; i < numInfoFields; i++ ) { for ( int i = 0; i < numInfoFields; i++ ) {
final String key = getDictionaryString(); final String key = getDictionaryString();
Object value = decoder.decodeTypedValue(); Object value = decoder.decodeTypedValue();
@ -297,143 +399,98 @@ public class BCF2Codec implements FeatureCodec<VariantContext>, ReferenceDepende
builder.attributes(infoFieldEntries); builder.attributes(infoFieldEntries);
} }
private void decodeGenotypes( final SitesInfoForDecoding siteInfo, final VariantContextBuilder builder ) { // --------------------------------------------------------------------------------
final List<String> samples = new ArrayList<String>(header.getGenotypeSamples()); //
final int nSamples = siteInfo.nSamples; // Decoding Genotypes
final int nFields = siteInfo.nFormatFields; //
// --------------------------------------------------------------------------------
if ( samples.size() != nSamples ) /**
throw new UserException.MalformedBCF2("GATK currently doesn't support reading BCF2 files with " + * Create the lazy loader for the genotypes data, and store it in the builder
"different numbers of samples per record. Saw " + samples.size() + * so that the VC will be able to decode on demand the genotypes data
" samples in header but have a record with " + nSamples + " samples"); *
* @param siteInfo
* @param builder
*/
private void createLazyGenotypesDecoder( final SitesInfoForDecoding siteInfo,
final VariantContextBuilder builder ) {
if (siteInfo.nSamples > 0) {
final LazyGenotypesContext.LazyParser lazyParser =
new BCF2LazyGenotypesDecoder(this, siteInfo.alleles, siteInfo.nSamples, siteInfo.nFormatFields);
final int nGenotypes = header.getGenotypeSamples().size();
LazyGenotypesContext lazy = new LazyGenotypesContext(lazyParser,
new LazyData(siteInfo.nFormatFields, decoder.getRecordBytes()),
nGenotypes);
final Map<String, List<Object>> fieldValues = decodeGenotypeFieldValues(nFields, nSamples); // did we resort the sample names? If so, we need to load the genotype data
final List<Genotype> genotypes = new ArrayList<Genotype>(nSamples); if ( !header.samplesWereAlreadySorted() )
for ( int i = 0; i < nSamples; i++ ) { lazy.decode();
// all of the information we need for each genotype, with default values
final String sampleName = samples.get(i);
List<Allele> alleles = null;
boolean isPhased = false;
double log10PError = VariantContext.NO_LOG10_PERROR;
Set<String> filters = null;
Map<String, Object> attributes = null;
double[] log10Likelihoods = null;
for ( final Map.Entry<String, List<Object>> entry : fieldValues.entrySet() ) { builder.genotypesNoValidation(lazy);
final String field = entry.getKey();
Object value = entry.getValue().get(i);
try {
if ( field.equals(VCFConstants.GENOTYPE_KEY) ) {
alleles = decodeGenotypeAlleles(siteInfo.alleles, (List<Integer>)value);
} else if ( field.equals(VCFConstants.GENOTYPE_QUALITY_KEY) ) {
if ( value != BCF2Type.INT8.getMissingJavaValue() )
log10PError = ((Integer)value) / -10.0;
} else if ( field.equals(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY) ) {
final List<Integer> pls = (List<Integer>)value;
if ( pls != null ) { // we have a PL field
log10Likelihoods = new double[pls.size()];
for ( int j = 0; j < log10Likelihoods.length; j++ ) {
final double d = pls.get(j);
log10Likelihoods[j] = d == -0.0 ? 0.0 : d / -10.0;
}
}
} else if ( field.equals(VCFConstants.GENOTYPE_FILTER_KEY) ) {
throw new ReviewedStingException("Genotype filters not implemented in GATK BCF2");
//filters = new HashSet<String>(values.get(i));
} else { // add to attributes
if ( value != null ) { // don't add missing values
if ( attributes == null ) attributes = new HashMap<String, Object>(nFields);
if ( value instanceof List && ((List)value).size() == 1)
value = ((List)value).get(0);
attributes.put(field, value);
}
}
} catch ( ClassCastException e ) {
throw new UserException.MalformedBCF2("BUG: expected encoding of field " + field
+ " inconsistent with the value observed in the decoded value in the "
+ " BCF file. Value was " + value);
}
}
if ( alleles == null ) throw new UserException.MalformedBCF2("BUG: no alleles found");
final Genotype g = new Genotype(sampleName, alleles, log10PError, filters, attributes, isPhased, log10Likelihoods);
genotypes.add(g);
}
builder.genotypes(genotypes);
}
private final List<Allele> decodeGenotypeAlleles(final ArrayList<Allele> siteAlleles, final List<Integer> encoded) {
if ( encoded == null )
// no called sample GT = .
return Collections.emptyList();
else {
// we have at least some alleles to decode
final List<Allele> gt = new ArrayList<Allele>(encoded.size());
for ( final Integer encode : encoded ) {
if ( encode == null ) // absent, as are all following by definition
return gt;
else {
final int offset = encode >> 1;
if ( offset == 0 )
gt.add(Allele.NO_CALL);
else
gt.add(siteAlleles.get(offset - 1));
}
}
return gt;
} }
} }
private final Map<String, List<Object>> decodeGenotypeFieldValues(final int nFields, final int nSamples) { public static class LazyData {
assert (nFields > 0 && nSamples > 0) || (nFields == 0 && nSamples == 0); final public int nGenotypeFields;
final public byte[] bytes;
if ( nFields == 0 ) // fast path exit for sites only file @Requires({"nGenotypeFields > 0", "bytes != null"})
return Collections.emptyMap(); public LazyData(final int nGenotypeFields, final byte[] bytes) {
else { this.nGenotypeFields = nGenotypeFields;
final Map<String, List<Object>> map = new LinkedHashMap<String, List<Object>>(nFields); this.bytes = bytes;
for ( int i = 0; i < nFields; i++ ) {
final String field = getDictionaryString();
final byte typeDescriptor = decoder.readTypeDescriptor();
final List<Object> values = new ArrayList<Object>(nSamples);
for ( int j = 0; j < nSamples; j++ )
values.add(decoder.decodeTypedValue(typeDescriptor));
map.put(field, values);
}
return map;
} }
} }
@Ensures("result != null")
private final String getDictionaryString() { private final String getDictionaryString() {
return getDictionaryString((Integer) decoder.decodeTypedValue()); return getDictionaryString((Integer) decoder.decodeTypedValue());
} }
private final String getDictionaryString(final int offset) { @Requires("offset < dictionary.size()")
if ( offset >= dictionary.size() ) throw new UserException.MalformedBCF2("BUG: no dictionary field found at offset " + offset); @Ensures("result != null")
final String field = dictionary.get(offset); protected final String getDictionaryString(final int offset) {
return field; return dictionary.get(offset);
} }
/**
* Translate the config offset as encoded in the BCF file into the actual string
* name of the contig from the dictionary
*
* @param contigOffset
* @return
*/
@Requires({"contigOffset >= 0", "contigOffset < contigNames.size()"})
@Ensures("result != null")
private final String lookupContigName( final int contigOffset ) { private final String lookupContigName( final int contigOffset ) {
if ( contigOffset < contigNames.size() ) { return contigNames.get(contigOffset);
return contigNames.get(contigOffset);
}
else {
throw new UserException.MalformedBCF2(String.format("No contig at index %d present in the sequence dictionary from the BCF2 header (%s)", contigOffset, contigNames));
}
} }
@Requires("header != null")
@Ensures({"result != null", "! result.isEmpty()"})
private final ArrayList<String> parseDictionary(final VCFHeader header) { private final ArrayList<String> parseDictionary(final VCFHeader header) {
final ArrayList<String> dict = BCF2Utils.makeDictionary(header); final ArrayList<String> dict = BCF2Utils.makeDictionary(header);
// if we got here we never found a dictionary, or there are no elements in the dictionary // if we got here we never found a dictionary, or there are no elements in the dictionary
if ( dict.size() == 0 ) if ( dict.isEmpty() )
throw new UserException.MalformedBCF2("Dictionary header element was absent or empty"); error("Dictionary header element was absent or empty");
return dict; return dict;
} }
/**
* @return the VCFHeader we found in this BCF2 file
*/
protected VCFHeader getHeader() {
return header;
}
@Requires("field != null")
@Ensures("result != null")
protected BCF2GenotypeFieldDecoders.Decoder getGenotypeFieldDecoder(final String field) {
return gtFieldDecoders.getDecoder(field);
}
private final void error(final String message) throws RuntimeException {
throw new UserException.MalformedBCF2(String.format("At record %d with position %d:", recordNo, pos, message));
}
} }

View File

@ -24,6 +24,8 @@
package org.broadinstitute.sting.utils.codecs.bcf2; package org.broadinstitute.sting.utils.codecs.bcf2;
import com.google.java.contract.Ensures;
import com.google.java.contract.Requires;
import org.apache.log4j.Logger; import org.apache.log4j.Logger;
import org.broad.tribble.FeatureCodec; import org.broad.tribble.FeatureCodec;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
@ -33,12 +35,13 @@ import java.io.ByteArrayInputStream;
import java.io.IOException; import java.io.IOException;
import java.io.InputStream; import java.io.InputStream;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Arrays;
public class BCF2Decoder { public final class BCF2Decoder {
final protected static Logger logger = Logger.getLogger(FeatureCodec.class); final protected static Logger logger = Logger.getLogger(FeatureCodec.class);
byte[] recordBytes; byte[] recordBytes = null;
ByteArrayInputStream recordStream; ByteArrayInputStream recordStream = null;
public BCF2Decoder() { public BCF2Decoder() {
// nothing to do // nothing to do
@ -66,6 +69,7 @@ public class BCF2Decoder {
* @return * @return
*/ */
public void readNextBlock(final int blockSizeInBytes, final InputStream stream) { public void readNextBlock(final int blockSizeInBytes, final InputStream stream) {
if ( blockSizeInBytes < 0 ) throw new UserException.MalformedBCF2("Invalid block size " + blockSizeInBytes);
setRecordBytes(readRecordBytes(blockSizeInBytes, stream)); setRecordBytes(readRecordBytes(blockSizeInBytes, stream));
} }
@ -112,9 +116,9 @@ public class BCF2Decoder {
* *
* @param recordBytes * @param recordBytes
*/ */
@Requires("recordBytes != null")
@Ensures({"this.recordBytes == recordBytes", "recordStream != null"})
public void setRecordBytes(final byte[] recordBytes) { public void setRecordBytes(final byte[] recordBytes) {
assert recordBytes != null;
this.recordBytes = recordBytes; this.recordBytes = recordBytes;
this.recordStream = new ByteArrayInputStream(recordBytes); this.recordStream = new ByteArrayInputStream(recordBytes);
} }
@ -131,7 +135,7 @@ public class BCF2Decoder {
} }
public final Object decodeTypedValue(final byte typeDescriptor) { public final Object decodeTypedValue(final byte typeDescriptor) {
final int size = BCF2Utils.sizeIsOverflow(typeDescriptor) ? decodeVectorSize() : BCF2Utils.decodeSize(typeDescriptor); final int size = decodeNumberOfElements(typeDescriptor);
final BCF2Type type = BCF2Utils.decodeType(typeDescriptor); final BCF2Type type = BCF2Utils.decodeType(typeDescriptor);
assert size >= 0; assert size >= 0;
@ -155,7 +159,7 @@ public class BCF2Decoder {
public final Object decodeSingleValue(final BCF2Type type) { public final Object decodeSingleValue(final BCF2Type type) {
// TODO -- decodeTypedValue should integrate this routine // TODO -- decodeTypedValue should integrate this routine
final int value = BCF2Utils.readInt(type.getSizeInBytes(), recordStream); final int value = decodeInt(type);
if ( value == type.getMissingBytes() ) if ( value == type.getMissingBytes() )
return null; return null;
@ -184,26 +188,107 @@ public class BCF2Decoder {
final byte[] bytes = new byte[size]; // TODO -- in principle should just grab bytes from underlying array final byte[] bytes = new byte[size]; // TODO -- in principle should just grab bytes from underlying array
try { try {
recordStream.read(bytes); recordStream.read(bytes);
final String s = new String(bytes);
return BCF2Utils.isCollapsedString(s) ? BCF2Utils.exploreStringList(s) : s; int goodLength = 0;
for ( ; goodLength < bytes.length ; goodLength++ )
if ( bytes[goodLength] == 0 ) break;
if ( goodLength == 0 )
return null;
else {
final String s = new String(bytes, 0, goodLength);
return BCF2Utils.isCollapsedString(s) ? BCF2Utils.exploreStringList(s) : s;
}
} catch ( IOException e ) { } catch ( IOException e ) {
throw new ReviewedStingException("readByte failure", e); throw new ReviewedStingException("readByte failure", e);
} }
} }
private final int decodeVectorSize() { @Ensures("result >= 0")
final byte typeDescriptor = readTypeDescriptor(); public final int decodeNumberOfElements(final byte typeDescriptor) {
final int size = BCF2Utils.decodeSize(typeDescriptor); if ( BCF2Utils.sizeIsOverflow(typeDescriptor) )
final BCF2Type type = BCF2Utils.decodeType(typeDescriptor); // -1 ensures we explode immediately with a bad size if the result is missing
return decodeInt(readTypeDescriptor(), -1);
assert size == 1; else
assert type == BCF2Type.INT8 || type == BCF2Type.INT16 || type == BCF2Type.INT32; // the size is inline, so just decode it
return BCF2Utils.decodeSize(typeDescriptor);
return decodeInt(type.getSizeInBytes());
} }
public final int decodeInt(int bytesForEachInt) { /**
return BCF2Utils.readInt(bytesForEachInt, recordStream); * Decode an int from the stream. If the value in the stream is missing,
* returns missingValue. Requires the typeDescriptor indicate an inline
* single element event
*
* @param typeDescriptor
* @return
*/
@Requires("BCF2Utils.decodeSize(typeDescriptor) == 1")
public final int decodeInt(final byte typeDescriptor, final int missingValue) {
final BCF2Type type = BCF2Utils.decodeType(typeDescriptor);
final int i = decodeInt(type);
return i == type.getMissingBytes() ? missingValue : i;
}
@Requires("type != null")
public final int decodeInt(final BCF2Type type) {
return BCF2Utils.readInt(type.getSizeInBytes(), recordStream);
}
/**
* Low-level reader for int[]
*
* Requires a typeDescriptor so the function knows how many elements to read,
* and how they are encoded.
*
* If size == 0 => result is null
* If size > 0 => result depends on the actual values in the stream
* -- If the first element read is MISSING, result is null (all values are missing)
* -- Else result = int[N] where N is the first N non-missing values decoded
*
* @param maybeDest if not null we'll not allocate space for the vector, but instead use
* the externally allocated array of ints to store values. If the
* size of this vector is < the actual size of the elements, we'll be
* forced to use freshly allocated arrays. Also note that padded
* int elements are still forced to do a fresh allocation as well.
* @return see description
*/
@Requires({"BCF2Type.INTEGERS.contains(type)", "size >= 0", "type != null"})
public final int[] decodeIntArray(final int size, final BCF2Type type, int[] maybeDest) {
if ( size == 0 ) {
return null;
} else {
if ( maybeDest != null && maybeDest.length < size )
maybeDest = null; // by nulling this out we ensure that we do fresh allocations as maybeDest is too small
final int val1 = decodeInt(type);
if ( val1 == type.getMissingBytes() ) {
// fast path for first element being missing
for ( int i = 1; i < size; i++ ) decodeInt(type);
return null;
} else {
// we know we will have at least 1 element, so making the int[] is worth it
final int[] ints = maybeDest == null ? new int[size] : maybeDest;
ints[0] = val1; // we already read the first one
for ( int i = 1; i < size; i++ ) {
ints[i] = decodeInt(type);
if ( ints[i] == type.getMissingBytes() ) {
// read the rest of the missing values, dropping them
for ( int j = i + 1; j < size; j++ ) decodeInt(type);
// deal with auto-pruning by returning an int[] containing
// only the non-MISSING values. We do this by copying the first
// i elements, as i itself is missing
return Arrays.copyOf(ints, i);
}
}
return ints; // all of the elements were non-MISSING
}
}
}
public final int[] decodeIntArray(final byte typeDescriptor) {
final int size = decodeNumberOfElements(typeDescriptor);
final BCF2Type type = BCF2Utils.decodeType(typeDescriptor);
return decodeIntArray(size, type, null);
} }
public final double rawFloatToFloat(final int rawFloat) { public final double rawFloatToFloat(final int rawFloat) {

View File

@ -0,0 +1,282 @@
/*
* Copyright (c) 2012, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.utils.codecs.bcf2;
import com.google.java.contract.Ensures;
import com.google.java.contract.Requires;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants;
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader;
import org.broadinstitute.sting.utils.variantcontext.Allele;
import org.broadinstitute.sting.utils.variantcontext.GenotypeBuilder;
import java.util.*;
/**
 * An efficient scheme for building and obtaining specialized
 * genotype field decoders. Used by the BCFCodec to parse
 * with little overhead the fields from BCF2 encoded genotype
 * records.
 *
 * Each well-known FORMAT key (GT, FT, DP, AD, PL, GQ) is bound to a
 * purpose-built {@link Decoder}; anything else falls back to the
 * {@link GenericDecoder}, which stores the decoded value as a generic
 * genotype attribute.
 *
 * @author Mark DePristo
 * @since 6/12
 */
public class BCF2GenotypeFieldDecoders {
    final protected static Logger logger = Logger.getLogger(BCF2GenotypeFieldDecoders.class);

    // master switch for the specialized biallelic/diploid GT fast path below
    private final static boolean ENABLE_FASTPATH_GT = true;

    // smallest sample count for which the GT fast path is taken; 0 means "always"
    private final static int MIN_SAMPLES_FOR_FASTPATH_GENOTYPES = 0; // TODO -- update to reasonable number

    // initialized once per writer to allow parallel writers to work
    private final HashMap<String, Decoder> genotypeFieldDecoder = new HashMap<String, Decoder>();
    private final Decoder defaultDecoder = new GenericDecoder();

    /**
     * Build the registry of specialized decoders, keyed by FORMAT field name.
     *
     * @param header the VCF header (currently unused; kept so per-header
     *               decoder selection can be added later, per the TODO)
     */
    public BCF2GenotypeFieldDecoders(final VCFHeader header) {
        // TODO -- fill in appropriate decoders for each FORMAT field in the header
        genotypeFieldDecoder.put(VCFConstants.GENOTYPE_KEY, new GTDecoder());
        // currently the generic decoder handles FILTER values properly, in so far as we don't tolerate multiple filter field values per genotype
        genotypeFieldDecoder.put(VCFConstants.GENOTYPE_FILTER_KEY, new FTDecoder());
        genotypeFieldDecoder.put(VCFConstants.DEPTH_KEY, new DPDecoder());
        genotypeFieldDecoder.put(VCFConstants.GENOTYPE_ALLELE_DEPTHS, new ADDecoder());
        genotypeFieldDecoder.put(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY, new PLDecoder());
        genotypeFieldDecoder.put(VCFConstants.GENOTYPE_QUALITY_KEY, new GQDecoder());
    }

    // -----------------------------------------------------------------
    //
    // Genotype field decoder
    //
    // -----------------------------------------------------------------

    /**
     * Return decoder appropriate for field, or the generic decoder if no
     * specialized one is bound
     * @param field the GT field to decode
     * @return a non-null decoder
     */
    @Requires("field != null")
    @Ensures("result != null")
    public Decoder getDecoder(final String field) {
        final Decoder d = genotypeFieldDecoder.get(field);
        return d == null ? defaultDecoder : d;
    }

    /**
     * Decoder a field (implicit from creation) encoded as
     * typeDescriptor in the decoder object in the GenotypeBuilders
     * one for each sample in order.
     *
     * The way this works is that this decode method
     * iterates over the builders, decoding a genotype field
     * in BCF2 for each sample from decoder.
     *
     * This system allows us to easily use specialized
     * decoders for specific genotype field values. For example,
     * we use a special decoder to directly read the BCF2 data for
     * the PL field into a int[] rather than the generic List of Integer
     */
    public interface Decoder {
        @Requires({"siteAlleles != null", "! siteAlleles.isEmpty()",
                "field != null", "decoder != null", "gbs != null", "! gbs.isEmpty()"})
        public void decode(final List<Allele> siteAlleles,
                           final String field,
                           final BCF2Decoder decoder,
                           final byte typeDescriptor,
                           final List<GenotypeBuilder> gbs);
    }

    /**
     * Decoder for the GT (genotype) field.
     *
     * Each allele is encoded as an integer whose low bit carries phasing and
     * whose remaining bits hold (allele index + 1), with 0 meaning no-call
     * (see getAlleleFromEncoded). Missing values use the type's missing-bytes
     * sentinel.
     */
    private class GTDecoder implements Decoder {
        @Override
        public void decode(final List<Allele> siteAlleles, final String field, final BCF2Decoder decoder, final byte typeDescriptor, final List<GenotypeBuilder> gbs) {
            // we have to do a bit of low-level processing here as we want to know the size upfront
            final int ploidy = decoder.decodeNumberOfElements(typeDescriptor);
            // fast path applies only to biallelic sites with diploid records
            if ( ENABLE_FASTPATH_GT && siteAlleles.size() == 2 && ploidy == 2 && gbs.size() >= MIN_SAMPLES_FOR_FASTPATH_GENOTYPES )
                fastBiallelicDiploidDecode(siteAlleles, decoder, typeDescriptor, gbs);
            else {
                generalDecode(siteAlleles, ploidy, decoder, typeDescriptor, gbs);
            }
        }

        /**
         * fast path for many samples with diploid genotypes
         *
         * The way this would work is simple. Create a List&lt;Allele&gt; diploidGenotypes[] object
         * After decoding the offset, if that sample is diploid compute the
         * offset into the alleles vector which is simply offset = allele0 * nAlleles + allele1
         * if there's a value at diploidGenotypes[offset], use it, otherwise create the genotype
         * cache it and use that
         *
         * Some notes. If there are nAlleles at the site, there are implicitly actually
         * n + 1 options including no-call, which is why the cache below is 3 x 3
         * for a biallelic site.
         */
        @Requires("siteAlleles.size() == 2")
        @SuppressWarnings({"unchecked"})
        private final void fastBiallelicDiploidDecode(final List<Allele> siteAlleles,
                                                      final BCF2Decoder decoder,
                                                      final byte typeDescriptor,
                                                      final List<GenotypeBuilder> gbs) {
            final BCF2Type type = BCF2Utils.decodeType(typeDescriptor);

            // 3 encoded states per allele slot (no-call, ref, alt) => 9 possible diploid genotypes
            final int nPossibleGenotypes = 3 * 3;
            // cache of allele lists so all samples with the same genotype share one List
            final Object allGenotypes[] = new Object[nPossibleGenotypes];

            for ( final GenotypeBuilder gb : gbs ) {
                final int a1 = decoder.decodeInt(type);
                final int a2 = decoder.decodeInt(type);

                if ( a1 == type.getMissingBytes() ) {
                    assert a2 == type.getMissingBytes();
                    // no called sample GT = .
                    gb.alleles(null);
                } else if ( a2 == type.getMissingBytes() ) {
                    // only one allele present -- treat as a haploid call
                    gb.alleles(Arrays.asList(getAlleleFromEncoded(siteAlleles, a1)));
                } else {
                    // downshift to remove phase
                    final int offset = (a1 >> 1) * 3 + (a2 >> 1);
                    assert offset < allGenotypes.length;

                    // TODO -- how can I get rid of this cast?
                    List<Allele> gt = (List<Allele>)allGenotypes[offset];
                    if ( gt == null ) { // cache miss -- build and remember this genotype
                        final Allele allele1 = getAlleleFromEncoded(siteAlleles, a1);
                        final Allele allele2 = getAlleleFromEncoded(siteAlleles, a2);
                        gt = Arrays.asList(allele1, allele2);
                        allGenotypes[offset] = gt;
                    }

                    gb.alleles(gt);
                }
            }
        }

        /**
         * General-purpose GT decoding for any ploidy / any number of site alleles.
         *
         * @param siteAlleles    the alleles segregating at this site
         * @param ploidy         max number of allele values encoded per sample
         * @param decoder        source of the raw BCF2 bytes
         * @param typeDescriptor BCF2 type descriptor for the GT values
         * @param gbs            one builder per sample, in order
         */
        private final void generalDecode(final List<Allele> siteAlleles,
                                         final int ploidy,
                                         final BCF2Decoder decoder,
                                         final byte typeDescriptor,
                                         final List<GenotypeBuilder> gbs) {
            final BCF2Type type = BCF2Utils.decodeType(typeDescriptor);

            // a single cache for the encoded genotypes, since we don't actually need this vector
            final int[] tmp = new int[ploidy];

            for ( final GenotypeBuilder gb : gbs ) {
                final int[] encoded = decoder.decodeIntArray(ploidy, type, tmp);
                if ( encoded == null )
                    // no called sample GT = .
                    gb.alleles(null);
                else {
                    assert encoded.length > 0;

                    // we have at least some alleles to decode
                    final List<Allele> gt = new ArrayList<Allele>(encoded.length);

                    // note that the auto-pruning of fields magically handles different
                    // ploidy per sample at a site
                    for ( final int encode : encoded )
                        gt.add(getAlleleFromEncoded(siteAlleles, encode));
                    gb.alleles(gt);
                }
            }
        }

        /**
         * Map a single encoded GT value to its Allele.
         *
         * After removing the phase bit (low bit), a value of 0 means no-call;
         * otherwise the value is 1-based index into siteAlleles.
         *
         * @param siteAlleles the alleles segregating at this site
         * @param encode      raw encoded allele value from the BCF2 record
         * @return a non-null Allele (possibly Allele.NO_CALL)
         */
        @Requires({"siteAlleles != null && ! siteAlleles.isEmpty()", "encode >= 0"})
        @Ensures("result != null")
        private final Allele getAlleleFromEncoded(final List<Allele> siteAlleles, final int encode) {
            final int offset = encode >> 1;
            return offset == 0 ? Allele.NO_CALL : siteAlleles.get(offset - 1);
        }
    }

    /** Decoder for the DP (read depth) field: a single int per sample. */
    private class DPDecoder implements Decoder {
        @Override
        public void decode(final List<Allele> siteAlleles, final String field, final BCF2Decoder decoder, final byte typeDescriptor, final List<GenotypeBuilder> gbs) {
            for ( final GenotypeBuilder gb : gbs ) {
                // the -1 is for missing
                gb.DP(decoder.decodeInt(typeDescriptor, -1));
            }
        }
    }

    /** Decoder for the GQ (genotype quality) field: a single int per sample. */
    private class GQDecoder implements Decoder {
        @Override
        public void decode(final List<Allele> siteAlleles, final String field, final BCF2Decoder decoder, final byte typeDescriptor, final List<GenotypeBuilder> gbs) {
            for ( final GenotypeBuilder gb : gbs ) {
                // the -1 is for missing
                gb.GQ(decoder.decodeInt(typeDescriptor, -1));
            }
        }
    }

    /** Decoder for the AD (allele depths) field: an int[] per sample, read directly. */
    private class ADDecoder implements Decoder {
        @Override
        public void decode(final List<Allele> siteAlleles, final String field, final BCF2Decoder decoder, final byte typeDescriptor, final List<GenotypeBuilder> gbs) {
            for ( final GenotypeBuilder gb : gbs ) {
                gb.AD(decoder.decodeIntArray(typeDescriptor));
            }
        }
    }

    /** Decoder for the PL (phred-scaled likelihoods) field: an int[] per sample, read directly. */
    private class PLDecoder implements Decoder {
        @Override
        public void decode(final List<Allele> siteAlleles, final String field, final BCF2Decoder decoder, final byte typeDescriptor, final List<GenotypeBuilder> gbs) {
            for ( final GenotypeBuilder gb : gbs ) {
                gb.PL(decoder.decodeIntArray(typeDescriptor));
            }
        }
    }

    /**
     * Fallback decoder for any FORMAT field without a specialized decoder;
     * stores the decoded value as a generic attribute on the genotype.
     */
    private class GenericDecoder implements Decoder {
        @Override
        public void decode(final List<Allele> siteAlleles, final String field, final BCF2Decoder decoder, final byte typeDescriptor, final List<GenotypeBuilder> gbs) {
            for ( final GenotypeBuilder gb : gbs ) {
                Object value = decoder.decodeTypedValue(typeDescriptor);
                if ( value != null ) { // don't add missing values
                    if ( value instanceof List && ((List)value).size() == 1) {
                        // todo -- I really hate this, and it suggests that the code isn't completely right
                        // the reason it's here is that it's possible to prune down a vector to a singleton
                        // value and there we have the contract that the value comes back as an atomic value
                        // not a vector of size 1
                        value = ((List)value).get(0);
                    }
                    gb.attribute(field, value);
                }
            }
        }
    }

    /**
     * Decoder for the FT (per-genotype filter) field.
     *
     * A lone String value is wrapped in a singleton list; otherwise the value
     * is assumed to already be a List of filter strings (unchecked cast).
     */
    private class FTDecoder implements Decoder {
        @Override
        public void decode(final List<Allele> siteAlleles, final String field, final BCF2Decoder decoder, final byte typeDescriptor, final List<GenotypeBuilder> gbs) {
            for ( final GenotypeBuilder gb : gbs ) {
                Object value = decoder.decodeTypedValue(typeDescriptor);
                if ( value != null ) { // don't add missing values
                    gb.filters(value instanceof String ? Collections.singletonList((String)value) : (List<String>)value);
                }
            }
        }
    }
}

View File

@ -0,0 +1,103 @@
/*
* Copyright (c) 2012, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.utils.codecs.bcf2;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.variantcontext.*;
import java.util.*;
/**
 * Lazy version of genotypes decoder for BCF2 genotypes
 *
 * Implements LazyGenotypesContext.LazyParser so that per-sample genotype
 * data is only decoded from its raw BCF2 bytes when somebody actually
 * asks for the genotypes.
 *
 * @author Mark DePristo
 * @since 5/12
 */
class BCF2LazyGenotypesDecoder implements LazyGenotypesContext.LazyParser {
    final protected static Logger logger = Logger.getLogger(BCF2LazyGenotypesDecoder.class);

    // the essential information for us to use to decode the genotypes data
    // initialized when this lazy decoder is created, as we know all of this from the BCF2Codec
    // and its stored here again for code cleanliness
    private final BCF2Codec codec;
    private final ArrayList<Allele> siteAlleles;
    private final int nSamples;
    private final int nFields;

    BCF2LazyGenotypesDecoder(final BCF2Codec codec, final ArrayList<Allele> alleles, final int nSamples, final int nFields) {
        this.codec = codec;
        this.siteAlleles = alleles;
        this.nSamples = nSamples;
        this.nFields = nFields;
    }

    @Override
    public LazyGenotypesContext.LazyData parse(final Object data) {
        if ( logger.isDebugEnabled() )
            logger.debug("Decoding BCF genotypes for " + nSamples + " samples with " + nFields + " fields each");

        // wrap the raw record bytes in a decoder we can pull values from
        final BCF2Decoder recordDecoder = new BCF2Decoder(((BCF2Codec.LazyData)data).bytes);

        // TODO -- fast path for sites only

        // sanity check: the record must describe exactly the samples declared in the header
        final List<String> sampleNames = new ArrayList<String>(codec.getHeader().getGenotypeSamples());
        if ( sampleNames.size() != nSamples )
            throw new UserException.MalformedBCF2("GATK currently doesn't support reading BCF2 files with " +
                    "different numbers of samples per record. Saw " + sampleNames.size() +
                    " samples in header but have a record with " + nSamples + " samples");

        // one builder per sample, in header order
        final ArrayList<GenotypeBuilder> builders = new ArrayList<GenotypeBuilder>(nSamples);
        for ( final String sampleName : sampleNames )
            builders.add(new GenotypeBuilder(sampleName));

        // decode each FORMAT field in turn, filling in every sample's builder
        int fieldsRemaining = nFields;
        while ( fieldsRemaining-- > 0 ) {
            // the field name arrives as an offset into the codec's string dictionary
            final String fieldName = codec.getDictionaryString((Integer) recordDecoder.decodeTypedValue());
            // the type of each element
            final byte fieldTypeDescriptor = recordDecoder.readTypeDescriptor();
            final BCF2GenotypeFieldDecoders.Decoder fieldDecoder = codec.getGenotypeFieldDecoder(fieldName);
            try {
                fieldDecoder.decode(siteAlleles, fieldName, recordDecoder, fieldTypeDescriptor, builders);
            } catch ( ClassCastException e ) {
                throw new UserException.MalformedBCF2("BUG: expected encoding of field " + fieldName
                        + " inconsistent with the value observed in the decoded value");
            }
        }

        // materialize the genotypes from the builders
        final ArrayList<Genotype> genotypes = new ArrayList<Genotype>(nSamples);
        for ( final GenotypeBuilder builder : builders )
            genotypes.add(builder.make());

        return new LazyGenotypesContext.LazyData(genotypes, codec.getHeader().getSampleNamesInOrder(), codec.getHeader().getSampleNameToOffset());
    }
}

View File

@ -1,143 +0,0 @@
/*
* Copyright (c) 2012, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.utils.codecs.bcf2;
import org.broad.tribble.FeatureCodecHeader;
import org.broad.tribble.readers.PositionalBufferedStream;
import org.broadinstitute.sting.commandline.Argument;
import org.broadinstitute.sting.commandline.Input;
import org.broadinstitute.sting.commandline.Output;
import org.broadinstitute.sting.commandline.RodBinding;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.RodWalker;
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader;
import org.broadinstitute.sting.utils.codecs.vcf.VCFUtils;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import org.broadinstitute.sting.utils.variantcontext.VariantContextBuilder;
import org.broadinstitute.sting.utils.variantcontext.writer.Options;
import org.broadinstitute.sting.utils.variantcontext.writer.VariantContextWriter;
import org.broadinstitute.sting.utils.variantcontext.writer.VariantContextWriterFactory;
import java.io.*;
import java.util.*;
/**
 * Testing BCF2
 *
 * Writes every variant seen by the walker to a BCF2 file, then on traversal
 * completion reads the file back with BCF2Codec, optionally printing each
 * record (and, with -keepVariants, the original VCF record it came from)
 * for manual comparison.
 *
 * @author Mark DePristo
 * @since 2012
 */
public class BCF2TestWalker extends RodWalker<Integer, Integer> {
    /**
     * Variants from this VCF file are used by this tool as input.
     * The file must at least contain the standard VCF header lines, but
     * can be empty (i.e., no variants are contained in the file).
     */
    @Input(fullName="variant", shortName = "V", doc="Input VCF file", required=true)
    public RodBinding<VariantContext> variants;

    @Argument(doc="keep variants", required=false)
    public boolean keepVariants = false;

    @Argument(doc="quiet", required=false)
    public boolean quiet = false;

    @Argument(doc="dontIndexOnTheFly", required=false)
    public boolean dontIndexOnTheFly = false;

    @Output(doc="File to which results should be written",required=true)
    protected File bcfFile;

    // variants kept in memory for post-traversal comparison (only when -keepVariants)
    private final List<VariantContext> vcs = new ArrayList<VariantContext>();
    protected VariantContextWriter writer;

    @Override
    public void initialize() {
        final Map<String, VCFHeader> vcfRods = VCFUtils.getVCFHeadersFromRods(getToolkit(), Collections.singletonList(variants));
        final VCFHeader header = VCFUtils.withUpdatedContigs(vcfRods.values().iterator().next(), getToolkit());

        try {
            EnumSet<Options> options = EnumSet.of(Options.FORCE_BCF);
            if ( !dontIndexOnTheFly ) options.add(Options.INDEX_ON_THE_FLY);
            writer = VariantContextWriterFactory.create(bcfFile, new FileOutputStream(bcfFile), getToolkit().getMasterSequenceDictionary(), options);
            writer.writeHeader(header);
        } catch ( FileNotFoundException e ) {
            throw new UserException.CouldNotCreateOutputFile(bcfFile, e);
        }
    }

    /**
     * Write each variant at this locus to the BCF file, retaining it in
     * memory as well when -keepVariants is set.
     */
    public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
        if ( tracker == null ) // RodWalkers can make funky map calls
            return 0;

        for ( VariantContext vc : tracker.getValues(variants, context.getLocation())) {
            writer.add(vc);
            if ( keepVariants ) vcs.add(vc);
        }

        return 1;
    }

    //
    // default reduce -- doesn't do anything at all
    //
    public Integer reduceInit() { return 0; }
    public Integer reduce(Integer counter, Integer sum) { return counter + sum; }

    /**
     * Close the writer, then read the BCF file back record by record,
     * optionally printing each decoded record (and its expected VCF
     * counterpart when -keepVariants was used).
     */
    public void onTraversalDone(Integer sum) {
        try {
            writer.close();
            logger.info("Closed writer");

            // first pass over the file: read just the header so we know where records start
            final BCF2Codec codec = new BCF2Codec();
            final FeatureCodecHeader header;
            final PositionalBufferedStream headerStream = new PositionalBufferedStream(new FileInputStream(bcfFile));
            try {
                header = codec.readHeader(headerStream);
            } finally {
                headerStream.close();
            }

            // second pass: skip the header bytes and decode every record
            // (this stream was previously leaked -- close it in a finally block)
            final PositionalBufferedStream pbs = new PositionalBufferedStream(new FileInputStream(bcfFile));
            try {
                pbs.skip(header.getHeaderEnd());

                final Iterator<VariantContext> it = vcs.iterator();
                while ( ! pbs.isDone() ) {
                    if ( keepVariants ) {
                        final VariantContext expected = it.next();
                        if ( ! quiet )
                            System.out.printf("vcf = %s %d %s%n", expected.getChr(), expected.getStart(), expected);
                    }

                    final VariantContext bcfRaw = codec.decode(pbs);
                    final VariantContext bcf = new VariantContextBuilder(bcfRaw).source("variant").make();

                    if ( ! quiet ) {
                        System.out.printf("bcf = %s %d %s%n", bcf.getChr(), bcf.getStart(), bcf.toString());
                        System.out.printf("--------------------------------------------------%n");
                    }
                }
            } finally {
                pbs.close();
            }
        } catch ( IOException e ) {
            // preserve the underlying cause instead of the uninformative "bad user!" string
            throw new UserException.CouldNotCreateOutputFile(bcfFile, e);
        }
    }
}

View File

@ -24,18 +24,22 @@
package org.broadinstitute.sting.utils.codecs.bcf2; package org.broadinstitute.sting.utils.codecs.bcf2;
import com.google.java.contract.Requires;
import java.util.EnumSet;
/** /**
* BCF2 types and information * BCF2 types and associated information
* *
* @author depristo * @author depristo
* @since 05/12 * @since 05/12
*/ */
public enum BCF2Type { public enum BCF2Type {
INT8(1, 1, BCF2Utils.INT8_MISSING_VALUE, -127, 127), // todo -- confirm range INT8 (1, 1, 0xFFFFFF80, -127, 127), // todo -- confirm range
INT16(2, 2, BCF2Utils.INT16_MISSING_VALUE, -32767, 32767), INT16(2, 2, 0xFFFF8000, -32767, 32767),
INT32(3, 4, BCF2Utils.INT32_MISSING_VALUE, -2147483647, 2147483647), INT32(3, 4, 0x80000000, -2147483647, 2147483647),
FLOAT(5, 4, BCF2Utils.FLOAT_MISSING_VALUE), FLOAT(5, 4, 0x7F800001),
CHAR(7); CHAR (7, 1, 0x00000000);
private final int id; private final int id;
private final Object missingJavaValue; private final Object missingJavaValue;
@ -60,11 +64,53 @@ public enum BCF2Type {
this.maxValue = maxValue; this.maxValue = maxValue;
} }
/**
* How many bytes are used to represent this type on disk?
* @return
*/
public int getSizeInBytes() { public int getSizeInBytes() {
return sizeInBytes; return sizeInBytes;
} }
/**
* The ID according to the BCF2 specification
* @return
*/
public int getID() { return id; } public int getID() { return id; }
/**
* Can we encode value v in this type, according to its declared range.
*
* Only makes sense for integer values
*
* @param v
* @return
*/
@Requires("INTEGERS.contains(this)")
public final boolean withinRange(final long v) { return v >= minValue && v <= maxValue; } public final boolean withinRange(final long v) { return v >= minValue && v <= maxValue; }
/**
* Return the java object (aka null) that is used to represent a missing value for this
* type in Java
*
* @return
*/
public Object getMissingJavaValue() { return missingJavaValue; } public Object getMissingJavaValue() { return missingJavaValue; }
/**
* The bytes (encoded as an int) that are used to represent a missing value
* for this type in BCF2
*
* @return
*/
public int getMissingBytes() { return missingBytes; } public int getMissingBytes() { return missingBytes; }
/**
* An enum set of the types that might represent Integer values
*/
public final static EnumSet<BCF2Type> INTEGERS = EnumSet.of(INT8, INT16, INT32);
public boolean isIntegerType() {
return INTEGERS.contains(this);
}
} }

View File

@ -24,6 +24,8 @@
package org.broadinstitute.sting.utils.codecs.bcf2; package org.broadinstitute.sting.utils.codecs.bcf2;
import com.google.java.contract.Ensures;
import com.google.java.contract.Requires;
import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants;
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader; import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader;
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLine; import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLine;
@ -33,9 +35,8 @@ import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import java.io.File; import java.io.File;
import java.io.IOException; import java.io.IOException;
import java.io.InputStream; import java.io.InputStream;
import java.util.ArrayList; import java.io.OutputStream;
import java.util.Arrays; import java.util.*;
import java.util.List;
/** /**
* Common utilities for working with BCF2 files * Common utilities for working with BCF2 files
@ -45,7 +46,7 @@ import java.util.List;
* @author depristo * @author depristo
* @since 5/12 * @since 5/12
*/ */
public class BCF2Utils { public final class BCF2Utils {
public static final byte[] MAGIC_HEADER_LINE = "BCF\2".getBytes(); public static final byte[] MAGIC_HEADER_LINE = "BCF\2".getBytes();
public static final int MAX_ALLELES_IN_GENOTYPES = 127; public static final int MAX_ALLELES_IN_GENOTYPES = 127;
@ -53,12 +54,6 @@ public class BCF2Utils {
public static final int OVERFLOW_ELEMENT_MARKER = 15; public static final int OVERFLOW_ELEMENT_MARKER = 15;
public static final int MAX_INLINE_ELEMENTS = 14; public static final int MAX_INLINE_ELEMENTS = 14;
// Note that these values are prefixed by FFFFFF for convenience
public static final int INT8_MISSING_VALUE = 0xFFFFFF80;
public static final int INT16_MISSING_VALUE = 0xFFFF8000;
public static final int INT32_MISSING_VALUE = 0x80000000;
public static final int FLOAT_MISSING_VALUE = 0x7F800001;
public final static BCF2Type[] INTEGER_TYPES_BY_SIZE = new BCF2Type[]{BCF2Type.INT8, BCF2Type.INT16, BCF2Type.INT32}; public final static BCF2Type[] INTEGER_TYPES_BY_SIZE = new BCF2Type[]{BCF2Type.INT8, BCF2Type.INT16, BCF2Type.INT32};
public final static BCF2Type[] ID_TO_ENUM; public final static BCF2Type[] ID_TO_ENUM;
@ -77,11 +72,17 @@ public class BCF2Utils {
* The dictionary is an ordered list of common VCF identifers (FILTER, INFO, and FORMAT) * The dictionary is an ordered list of common VCF identifers (FILTER, INFO, and FORMAT)
* fields. * fields.
* *
* Note that its critical that the list be dedupped and sorted in a consistent manner each time,
* as the BCF2 offsets are encoded relative to this dictionary, and if it isn't determined exactly
* the same way as in the header each time it's very bad
*
* @param header the VCFHeader from which to build the dictionary * @param header the VCFHeader from which to build the dictionary
* @return a non-null dictionary of elements, may be empty * @return a non-null dictionary of elements, may be empty
*/ */
@Requires("header != null")
@Ensures({"result != null", "new HashSet(result).size() == result.size()"})
public final static ArrayList<String> makeDictionary(final VCFHeader header) { public final static ArrayList<String> makeDictionary(final VCFHeader header) {
final ArrayList<String> dict = new ArrayList<String>(); final Set<String> dict = new TreeSet<String>();
// set up the strings dictionary // set up the strings dictionary
dict.add(VCFConstants.PASSES_FILTERS_v4); // special case the special PASS field dict.add(VCFConstants.PASSES_FILTERS_v4); // special case the special PASS field
@ -92,23 +93,27 @@ public class BCF2Utils {
} }
} }
return dict; return new ArrayList<String>(dict);
} }
@Requires({"nElements >= 0", "type != null"})
public final static byte encodeTypeDescriptor(final int nElements, final BCF2Type type ) { public final static byte encodeTypeDescriptor(final int nElements, final BCF2Type type ) {
int encodeSize = Math.min(nElements, OVERFLOW_ELEMENT_MARKER); int encodeSize = Math.min(nElements, OVERFLOW_ELEMENT_MARKER);
byte typeByte = (byte)((0x0F & encodeSize) << 4 | (type.getID() & 0x0F)); byte typeByte = (byte)((0x0F & encodeSize) << 4 | (type.getID() & 0x0F));
return typeByte; return typeByte;
} }
@Ensures("result >= 0")
public final static int decodeSize(final byte typeDescriptor) { public final static int decodeSize(final byte typeDescriptor) {
return (0xF0 & typeDescriptor) >> 4; return (0xF0 & typeDescriptor) >> 4;
} }
@Ensures("result >= 0")
public final static int decodeTypeID(final byte typeDescriptor) { public final static int decodeTypeID(final byte typeDescriptor) {
return typeDescriptor & 0x0F; return typeDescriptor & 0x0F;
} }
@Ensures("result != null")
public final static BCF2Type decodeType(final byte typeDescriptor) { public final static BCF2Type decodeType(final byte typeDescriptor) {
return ID_TO_ENUM[decodeTypeID(typeDescriptor)]; return ID_TO_ENUM[decodeTypeID(typeDescriptor)];
} }
@ -117,6 +122,7 @@ public class BCF2Utils {
return decodeSize(typeDescriptor) == OVERFLOW_ELEMENT_MARKER; return decodeSize(typeDescriptor) == OVERFLOW_ELEMENT_MARKER;
} }
@Requires("nElements >= 0")
public final static boolean willOverflow(final long nElements) { public final static boolean willOverflow(final long nElements) {
return nElements > MAX_INLINE_ELEMENTS; return nElements > MAX_INLINE_ELEMENTS;
} }
@ -128,6 +134,7 @@ public class BCF2Utils {
} }
public final static byte readByte(final InputStream stream) { public final static byte readByte(final InputStream stream) {
// TODO -- shouldn't be capturing error here
try { try {
return (byte)(stream.read() & 0xFF); return (byte)(stream.read() & 0xFF);
} catch ( IOException e ) { } catch ( IOException e ) {
@ -135,6 +142,7 @@ public class BCF2Utils {
} }
} }
@Requires({"stream != null", "bytesForEachInt > 0"})
public final static int readInt(int bytesForEachInt, final InputStream stream) { public final static int readInt(int bytesForEachInt, final InputStream stream) {
switch ( bytesForEachInt ) { switch ( bytesForEachInt ) {
case 1: { case 1: {
@ -161,10 +169,10 @@ public class BCF2Utils {
* @param strings size > 1 list of strings * @param strings size > 1 list of strings
* @return * @return
*/ */
@Requires({"strings != null", "strings.size() > 1"})
@Ensures("result != null")
public static final String collapseStringList(final List<String> strings) { public static final String collapseStringList(final List<String> strings) {
assert strings.size() > 1; final StringBuilder b = new StringBuilder();
StringBuilder b = new StringBuilder();
for ( final String s : strings ) { for ( final String s : strings ) {
assert s.indexOf(",") == -1; // no commas in individual strings assert s.indexOf(",") == -1; // no commas in individual strings
b.append(",").append(s); b.append(",").append(s);
@ -181,12 +189,15 @@ public class BCF2Utils {
* @param collapsed * @param collapsed
* @return * @return
*/ */
@Requires({"collapsed != null", "isCollapsedString(collapsed)"})
@Ensures("result != null")
public static final List<String> exploreStringList(final String collapsed) { public static final List<String> exploreStringList(final String collapsed) {
assert isCollapsedString(collapsed); assert isCollapsedString(collapsed);
final String[] exploded = collapsed.substring(1).split(","); final String[] exploded = collapsed.substring(1).split(",");
return Arrays.asList(exploded); return Arrays.asList(exploded);
} }
@Requires("s != null")
public static final boolean isCollapsedString(final String s) { public static final boolean isCollapsedString(final String s) {
return s.charAt(0) == ','; return s.charAt(0) == ',';
} }
@ -200,6 +211,8 @@ public class BCF2Utils {
* @param vcfFile * @param vcfFile
* @return * @return
*/ */
@Requires("vcfFile != null")
@Ensures("result != null")
public static final File shadowBCF(final File vcfFile) { public static final File shadowBCF(final File vcfFile) {
final String path = vcfFile.getAbsolutePath(); final String path = vcfFile.getAbsolutePath();
if ( path.contains(".vcf") ) if ( path.contains(".vcf") )
@ -207,4 +220,109 @@ public class BCF2Utils {
else else
return new File( path + ".bcf" ); return new File( path + ".bcf" );
} }
@Ensures("BCF2Type.INTEGERS.contains(result)")
public final static BCF2Type determineIntegerType(final int value) {
for ( final BCF2Type potentialType : INTEGER_TYPES_BY_SIZE) {
if ( potentialType.withinRange(value) )
return potentialType;
}
throw new ReviewedStingException("Integer cannot be encoded in allowable range of even INT32: " + value);
}
@Ensures("BCF2Type.INTEGERS.contains(result)")
public final static BCF2Type determineIntegerType(final int[] values) {
// literally a copy of the code below, but there's no general way to unify lists and arrays in java
BCF2Type maxType = BCF2Type.INT8;
for ( final int value : values ) {
final BCF2Type type1 = determineIntegerType(value);
switch ( type1 ) {
case INT8: break;
case INT16: maxType = BCF2Type.INT16; break;
case INT32: return BCF2Type.INT32; // fast path for largest possible value
default: throw new ReviewedStingException("Unexpected integer type " + type1 );
}
}
return maxType;
}
/**
* Returns the maximum BCF2 integer size of t1 and t2
*
* For example, if t1 == INT8 and t2 == INT16 returns INT16
*
* @param t1
* @param t2
* @return
*/
@Requires({"BCF2Type.INTEGERS.contains(t1)","BCF2Type.INTEGERS.contains(t2)"})
@Ensures("BCF2Type.INTEGERS.contains(result)")
public final static BCF2Type maxIntegerType(final BCF2Type t1, final BCF2Type t2) {
switch ( t1 ) {
case INT8: return t2;
case INT16: return t2 == BCF2Type.INT32 ? t2 : t1;
case INT32: return t1;
default: throw new ReviewedStingException("BUG: unexpected BCF2Type " + t1);
}
}
@Ensures("BCF2Type.INTEGERS.contains(result)")
public final static BCF2Type determineIntegerType(final List<Integer> values) {
BCF2Type maxType = BCF2Type.INT8;
for ( final int value : values ) {
final BCF2Type type1 = determineIntegerType(value);
switch ( type1 ) {
case INT8: break;
case INT16: maxType = BCF2Type.INT16; break;
case INT32: return BCF2Type.INT32; // fast path for largest possible value
default: throw new ReviewedStingException("Unexpected integer type " + type1 );
}
}
return maxType;
}
/**
* Helper function that takes an object and returns a list representation
* of it:
*
* o == null => []
* o is a list => o
* else => [o]
*
* @param o
* @return
*/
public final static List<Object> toList(final Object o) {
if ( o == null ) return Collections.emptyList();
else if ( o instanceof List ) return (List<Object>)o;
else return Collections.singletonList(o);
}
public final static void encodeRawBytes(final int value, final BCF2Type type, final OutputStream encodeStream) throws IOException {
switch ( type.getSizeInBytes() ) {
case 1:
encodeStream.write(0xFF & value);
break;
case 2:
encodeStream.write((0xFF00 & value) >> 8);
encodeStream.write(0xFF & value);
break;
case 4:
encodeStream.write((0xFF000000 & value) >> 24);
encodeStream.write((0x00FF0000 & value) >> 16);
encodeStream.write((0x0000FF00 & value) >> 8);
encodeStream.write((0x000000FF & value));
break;
default:
throw new ReviewedStingException("BUG: unexpected type size " + type);
}
// general case for reference
// for ( int i = type.getSizeInBytes() - 1; i >= 0; i-- ) {
// final int shift = i * 8;
// int mask = 0xFF << shift;
// int byteValue = (mask & value) >> shift;
// encodeStream.write(byteValue);
// }
}
} }

View File

@ -28,6 +28,7 @@ public abstract class AbstractVCFCodec extends AsciiFeatureCodec<VariantContext>
// we have to store the list of strings that make up the header until they're needed // we have to store the list of strings that make up the header until they're needed
protected VCFHeader header = null; protected VCFHeader header = null;
protected VCFHeaderVersion version = null;
// a mapping of the allele // a mapping of the allele
protected Map<String, List<Allele>> alleleMap = new HashMap<String, List<Allele>>(3); protected Map<String, List<Allele>> alleleMap = new HashMap<String, List<Allele>>(3);
@ -48,7 +49,7 @@ public abstract class AbstractVCFCodec extends AsciiFeatureCodec<VariantContext>
protected final String[] locParts = new String[6]; protected final String[] locParts = new String[6];
// for performance we cache the hashmap of filter encodings for quick lookup // for performance we cache the hashmap of filter encodings for quick lookup
protected HashMap<String,LinkedHashSet<String>> filterHash = new HashMap<String,LinkedHashSet<String>>(); protected HashMap<String,List<String>> filterHash = new HashMap<String,List<String>>();
// we store a name to give to each of the variant contexts we emit // we store a name to give to each of the variant contexts we emit
protected String name = "Unknown"; protected String name = "Unknown";
@ -91,24 +92,12 @@ public abstract class AbstractVCFCodec extends AsciiFeatureCodec<VariantContext>
*/ */
public abstract Object readHeader(LineReader reader); public abstract Object readHeader(LineReader reader);
/**
* create a genotype map
*
* @param str the string
* @param alleles the list of alleles
* @param chr chrom
* @param pos position
* @return a mapping of sample name to genotype object
*/
public abstract LazyGenotypesContext.LazyData createGenotypeMap(String str, List<Allele> alleles, String chr, int pos);
/** /**
* parse the filter string, first checking to see if we already have parsed it in a previous attempt * parse the filter string, first checking to see if we already have parsed it in a previous attempt
* @param filterString the string to parse * @param filterString the string to parse
* @return a set of the filters applied * @return a set of the filters applied
*/ */
protected abstract Set<String> parseFilters(String filterString); protected abstract List<String> parseFilters(String filterString);
/** /**
* create a VCF header from a set of header record lines * create a VCF header from a set of header record lines
@ -117,6 +106,8 @@ public abstract class AbstractVCFCodec extends AsciiFeatureCodec<VariantContext>
* @return a VCFHeader object * @return a VCFHeader object
*/ */
protected VCFHeader parseHeaderFromLines( final List<String> headerStrings, final VCFHeaderVersion version ) { protected VCFHeader parseHeaderFromLines( final List<String> headerStrings, final VCFHeaderVersion version ) {
this.version = version;
Set<VCFHeaderLine> metaData = new TreeSet<VCFHeaderLine>(); Set<VCFHeaderLine> metaData = new TreeSet<VCFHeaderLine>();
Set<String> sampleNames = new LinkedHashSet<String>(); Set<String> sampleNames = new LinkedHashSet<String>();
int contigCounter = 0; int contigCounter = 0;
@ -320,7 +311,9 @@ public abstract class AbstractVCFCodec extends AsciiFeatureCodec<VariantContext>
String ref = getCachedString(parts[3].toUpperCase()); String ref = getCachedString(parts[3].toUpperCase());
String alts = getCachedString(parts[4].toUpperCase()); String alts = getCachedString(parts[4].toUpperCase());
builder.log10PError(parseQual(parts[5])); builder.log10PError(parseQual(parts[5]));
builder.filters(parseFilters(getCachedString(parts[6])));
final List<String> filters = parseFilters(getCachedString(parts[6]));
if ( filters != null ) builder.filters(new HashSet<String>(filters));
final Map<String, Object> attrs = parseInfo(parts[7]); final Map<String, Object> attrs = parseInfo(parts[7]);
builder.attributes(attrs); builder.attributes(attrs);
@ -719,4 +712,115 @@ public abstract class AbstractVCFCodec extends AsciiFeatureCodec<VariantContext>
try { stream.close(); } catch ( IOException e ) {} try { stream.close(); } catch ( IOException e ) {}
} }
} }
/**
 * Parse the FORMAT column and the per-sample genotype columns of a single VCF record
 * into lazily-materialized Genotype objects.
 *
 * @param str the tab-separated FORMAT field followed by one field per sample
 * @param alleles the alleles parsed from the record's REF/ALT columns
 * @param chr contig name; used only to annotate parse-error messages
 * @param pos position; used only to annotate parse-error messages
 * @return lazy genotype data: one Genotype per sample, in header order
 */
public LazyGenotypesContext.LazyData createGenotypeMap(final String str,
final List<Allele> alleles,
final String chr,
final int pos) {
// lazily size the scratch array from the header's column count; reused across records
if (genotypeParts == null)
genotypeParts = new String[header.getColumnCount() - NUM_STANDARD_FIELDS];
int nParts = ParsingUtils.split(str, genotypeParts, VCFConstants.FIELD_SEPARATOR_CHAR);
// every record must carry exactly the number of sample columns declared in the header
if ( nParts != genotypeParts.length )
generateException("there are " + (nParts-1) + " genotypes while the header requires that " + (genotypeParts.length-1) + " genotypes be present for all records", lineNo);
ArrayList<Genotype> genotypes = new ArrayList<Genotype>(nParts);
// get the format keys
int nGTKeys = ParsingUtils.split(genotypeParts[0], genotypeKeyArray, VCFConstants.GENOTYPE_FIELD_SEPARATOR_CHAR);
// cycle through the sample names
Iterator<String> sampleNameIterator = header.getGenotypeSamples().iterator();
// clear out our allele mapping
alleleMap.clear();
// cycle through the genotype strings (offset 0 is the FORMAT field itself)
for (int genotypeOffset = 1; genotypeOffset < nParts; genotypeOffset++) {
int GTValueSplitSize = ParsingUtils.split(genotypeParts[genotypeOffset], GTValueArray, VCFConstants.GENOTYPE_FIELD_SEPARATOR_CHAR);
final String sampleName = sampleNameIterator.next();
final GenotypeBuilder gb = new GenotypeBuilder(sampleName);
// check to see if the value list is longer than the key list, which is a problem
// (fewer values than keys is legal: trailing fields may be dropped)
if (nGTKeys < GTValueSplitSize)
generateException("There are too many keys for the sample " + sampleName + ", keys = " + parts[8] + ", values = " + parts[genotypeOffset]);
int genotypeAlleleLocation = -1;
if (nGTKeys >= 1) {
// pre-size the builder's attribute map: every key except GT may become an attribute
gb.maxAttributes(nGTKeys - 1);
for (int i = 0; i < nGTKeys; i++) {
final String gtKey = genotypeKeyArray[i];
boolean missing = i >= GTValueSplitSize;
// todo -- all of these on the fly parsing of the missing value should be static constants
if (gtKey.equals(VCFConstants.GENOTYPE_KEY)) {
// remember where GT sits; its alleles are parsed after this loop
genotypeAlleleLocation = i;
} else if ( missing ) {
// if its truly missing (there no provided value) skip adding it to the attributes
} else if (gtKey.equals(VCFConstants.GENOTYPE_FILTER_KEY)) {
// FT field: per-genotype filters (null means unfiltered, so nothing is set)
final List<String> filters = parseFilters(getCachedString(GTValueArray[i]));
if ( filters != null ) gb.filters(filters);
} else if ( GTValueArray[i].equals(VCFConstants.MISSING_VALUE_v4) ) {
// don't add missing values to the map
} else {
// well-known keys get typed slots on the builder; anything else is a raw attribute
if (gtKey.equals(VCFConstants.GENOTYPE_QUALITY_KEY)) {
if ( GTValueArray[i].equals(VCFConstants.MISSING_GENOTYPE_QUALITY_v3) )
gb.noGQ();
else
gb.GQ((int)Math.round(Double.valueOf(GTValueArray[i])));
} else if (gtKey.equals(VCFConstants.GENOTYPE_ALLELE_DEPTHS)) {
gb.AD(decodeInts(GTValueArray[i]));
} else if (gtKey.equals(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY)) {
gb.PL(decodeInts(GTValueArray[i]));
} else if (gtKey.equals(VCFConstants.GENOTYPE_LIKELIHOODS_KEY)) {
// GL is log10-scaled; converted to phred-scaled PLs for storage
gb.PL(GenotypeLikelihoods.fromGLField(GTValueArray[i]).getAsPLs());
} else if (gtKey.equals(VCFConstants.DEPTH_KEY)) {
gb.DP(Integer.valueOf(GTValueArray[i]));
} else {
gb.attribute(gtKey, GTValueArray[i]);
}
}
}
}
// check to make sure we found a genotype field if our version is less than 4.1 file
// (only VCF 4.1 allows records without a GT key)
if ( version != VCFHeaderVersion.VCF4_1 && genotypeAlleleLocation == -1 )
generateException("Unable to find the GT field for the record; the GT field is required in VCF4.0");
// GT, when present, must be the first FORMAT key
if ( genotypeAlleleLocation > 0 )
generateException("Saw GT field at position " + genotypeAlleleLocation + ", but it must be at the first position for genotypes when present");
final List<Allele> GTalleles = (genotypeAlleleLocation == -1 ? new ArrayList<Allele>(0) : parseGenotypeAlleles(GTValueArray[genotypeAlleleLocation], alleles, alleleMap));
gb.alleles(GTalleles);
// phased iff the GT value uses the '|' separator anywhere
gb.phased(genotypeAlleleLocation != -1 && GTValueArray[genotypeAlleleLocation].indexOf(VCFConstants.PHASED) != -1);
// add it to the list
try {
genotypes.add(gb.make());
} catch (TribbleException e) {
// re-throw with the genomic position so the bad record can be located
throw new TribbleException.InternalCodecException(e.getMessage() + ", at position " + chr+":"+pos);
}
}
return new LazyGenotypesContext.LazyData(genotypes, header.getSampleNamesInOrder(), header.getSampleNameToOffset());
}
// Shared scratch buffer for comma-splitting integer-list fields (AD, PL), kept
// static to avoid a per-call allocation.
// NOTE(review): this static mutable buffer makes decodeInts non-thread-safe, and a
// field with more than 10000 values would exceed its capacity -- confirm decoding
// is single-threaded and bounded before relying on this elsewhere.
private final static String[] INT_DECODE_ARRAY = new String[10000];
// Decode a comma-separated list of integers (e.g. "12,0,31") into an int[].
private final static int[] decodeInts(final String string) {
final int nValues = ParsingUtils.split(string, INT_DECODE_ARRAY, ',');
final int[] values = new int[nValues];
for ( int i = 0; i < nValues; i++ )
values[i] = Integer.valueOf(INT_DECODE_ARRAY[i]);
return values;
}
} }

View File

@ -1,3 +1,27 @@
/*
* Copyright (c) 2012, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.utils.codecs.vcf; package org.broadinstitute.sting.utils.codecs.vcf;
import org.broad.tribble.TribbleException; import org.broad.tribble.TribbleException;
@ -78,24 +102,24 @@ public class VCF3Codec extends AbstractVCFCodec {
* @param filterString the string to parse * @param filterString the string to parse
* @return a set of the filters applied * @return a set of the filters applied
*/ */
protected Set<String> parseFilters(String filterString) { protected List<String> parseFilters(String filterString) {
// null for unfiltered // null for unfiltered
if ( filterString.equals(VCFConstants.UNFILTERED) ) if ( filterString.equals(VCFConstants.UNFILTERED) )
return null; return null;
// empty set for passes filters // empty set for passes filters
LinkedHashSet<String> fFields = new LinkedHashSet<String>(); List<String> fFields = new ArrayList<String>();
if ( filterString.equals(VCFConstants.PASSES_FILTERS_v3) ) if ( filterString.equals(VCFConstants.PASSES_FILTERS_v3) )
return fFields; return new ArrayList<String>(fFields);
if ( filterString.length() == 0 ) if ( filterString.length() == 0 )
generateException("The VCF specification requires a valid filter status"); generateException("The VCF specification requires a valid filter status");
// do we have the filter string cached? // do we have the filter string cached?
if ( filterHash.containsKey(filterString) ) if ( filterHash.containsKey(filterString) )
return filterHash.get(filterString); return new ArrayList<String>(filterHash.get(filterString));
// otherwise we have to parse and cache the value // otherwise we have to parse and cache the value
if ( filterString.indexOf(VCFConstants.FILTER_CODE_SEPARATOR) == -1 ) if ( filterString.indexOf(VCFConstants.FILTER_CODE_SEPARATOR) == -1 )
@ -108,93 +132,6 @@ public class VCF3Codec extends AbstractVCFCodec {
return fFields; return fFields;
} }
/**
 * Parse the FORMAT column and the per-sample genotype columns of a single VCF3 record
 * into lazily-materialized Genotype objects. Unlike the VCF4 path, GT is always required.
 *
 * @param str the tab-separated FORMAT field followed by one field per sample
 * @param alleles the alleles parsed from the record's REF/ALT columns
 * @param chr contig name; used only to annotate parse-error messages
 * @param pos position; used only to annotate parse-error messages
 * @return lazy genotype data: one Genotype per sample, in header order
 */
public LazyGenotypesContext.LazyData createGenotypeMap(String str, List<Allele> alleles, String chr, int pos) {
// lazily size the scratch array from the header's column count; reused across records
if (genotypeParts == null)
genotypeParts = new String[header.getColumnCount() - NUM_STANDARD_FIELDS];
int nParts = ParsingUtils.split(str, genotypeParts, VCFConstants.FIELD_SEPARATOR_CHAR);
// every record must carry exactly the number of sample columns declared in the header
if ( nParts != genotypeParts.length )
generateException("there are " + (nParts-1) + " genotypes while the header requires that " + (genotypeParts.length-1) + " genotypes be present for all records", lineNo);
ArrayList<Genotype> genotypes = new ArrayList<Genotype>(nParts);
// get the format keys
int nGTKeys = ParsingUtils.split(genotypeParts[0], genotypeKeyArray, VCFConstants.GENOTYPE_FIELD_SEPARATOR_CHAR);
// cycle through the sample names
Iterator<String> sampleNameIterator = header.getGenotypeSamples().iterator();
// clear out our allele mapping
alleleMap.clear();
// cycle through the genotype strings (offset 0 is the FORMAT field itself)
for (int genotypeOffset = 1; genotypeOffset < nParts; genotypeOffset++) {
int GTValueSplitSize = ParsingUtils.split(genotypeParts[genotypeOffset], GTValueArray, VCFConstants.GENOTYPE_FIELD_SEPARATOR_CHAR);
double GTQual = VariantContext.NO_LOG10_PERROR;
Set<String> genotypeFilters = null;
Map<String, Object> gtAttributes = null;
String sampleName = sampleNameIterator.next();
// check to see if the value list is longer than the key list, which is a problem
// (fewer values than keys is legal: trailing fields may be dropped)
if (nGTKeys < GTValueSplitSize)
generateException("There are too many keys for the sample " + sampleName + ", keys = " + parts[8] + ", values = " + parts[genotypeOffset]);
int genotypeAlleleLocation = -1;
if (nGTKeys >= 1) {
gtAttributes = new HashMap<String, Object>(nGTKeys - 1);
for (int i = 0; i < nGTKeys; i++) {
final String gtKey = new String(genotypeKeyArray[i]);
boolean missing = i >= GTValueSplitSize;
if (gtKey.equals(VCFConstants.GENOTYPE_KEY)) {
// remember where GT sits; its alleles are parsed after this loop
genotypeAlleleLocation = i;
} else if (gtKey.equals(VCFConstants.GENOTYPE_QUALITY_KEY)) {
GTQual = missing ? parseQual(VCFConstants.MISSING_VALUE_v4) : parseQual(GTValueArray[i]);
} else if (gtKey.equals(VCFConstants.GENOTYPE_FILTER_KEY)) {
genotypeFilters = missing ? parseFilters(VCFConstants.MISSING_VALUE_v4) : parseFilters(getCachedString(GTValueArray[i]));
} else if ( missing || GTValueArray[i].equals(VCFConstants.MISSING_GENOTYPE_QUALITY_v3) ) {
// absent or VCF3-style missing values are normalized to the v4 missing marker
gtAttributes.put(gtKey, VCFConstants.MISSING_VALUE_v4);
} else {
gtAttributes.put(gtKey, new String(GTValueArray[i]));
}
}
}
// check to make sure we found a genotype field
if ( genotypeAlleleLocation < 0 )
generateException("Unable to find the GT field for the record; the GT field is required");
// GT must be the first FORMAT key
if ( genotypeAlleleLocation > 0 )
generateException("Saw GT field at position " + genotypeAlleleLocation + ", but it must be at the first position for genotypes");
// phased iff the GT value uses the '|' separator anywhere
boolean phased = GTValueArray[genotypeAlleleLocation].indexOf(VCFConstants.PHASED) != -1;
// add it to the list
try {
genotypes.add(new Genotype(sampleName,
parseGenotypeAlleles(GTValueArray[genotypeAlleleLocation], alleles, alleleMap),
GTQual,
genotypeFilters,
gtAttributes,
phased));
} catch (TribbleException e) {
// re-throw with the genomic position so the bad record can be located
throw new TribbleException.InternalCodecException(e.getMessage() + ", at position " + chr+":"+pos);
}
}
return new LazyGenotypesContext.LazyData(genotypes, header.sampleNamesInOrder, header.sampleNameToOffset);
}
@Override @Override
public boolean canDecode(final String potentialInput) { public boolean canDecode(final String potentialInput) {
return canDecodeFile(potentialInput, VCF3_MAGIC_HEADER); return canDecodeFile(potentialInput, VCF3_MAGIC_HEADER);

View File

@ -48,7 +48,6 @@ import java.util.*;
public class VCFCodec extends AbstractVCFCodec { public class VCFCodec extends AbstractVCFCodec {
// Our aim is to read in the records and convert to VariantContext as quickly as possible, relying on VariantContext to do the validation of any contradictory (or malformed) record parameters. // Our aim is to read in the records and convert to VariantContext as quickly as possible, relying on VariantContext to do the validation of any contradictory (or malformed) record parameters.
public final static String VCF4_MAGIC_HEADER = "##fileformat=VCFv4"; public final static String VCF4_MAGIC_HEADER = "##fileformat=VCFv4";
private VCFHeaderVersion version = null;
/** /**
* A VCF header the contains master info/filter/format records that we use to 'fill in' * A VCF header the contains master info/filter/format records that we use to 'fill in'
@ -127,121 +126,33 @@ public class VCFCodec extends AbstractVCFCodec {
* @param filterString the string to parse * @param filterString the string to parse
* @return a set of the filters applied or null if filters were not applied to the record (e.g. as per the missing value in a VCF) * @return a set of the filters applied or null if filters were not applied to the record (e.g. as per the missing value in a VCF)
*/ */
protected Set<String> parseFilters(String filterString) { protected List<String> parseFilters(String filterString) {
return parseFilters(filterHash, lineNo, filterString);
}
public static Set<String> parseFilters(final Map<String, LinkedHashSet<String>> cache, final int lineNo, final String filterString) {
// null for unfiltered // null for unfiltered
if ( filterString.equals(VCFConstants.UNFILTERED) ) if ( filterString.equals(VCFConstants.UNFILTERED) )
return null; return null;
if ( filterString.equals(VCFConstants.PASSES_FILTERS_v4) ) if ( filterString.equals(VCFConstants.PASSES_FILTERS_v4) )
return Collections.emptySet(); return Collections.emptyList();
if ( filterString.equals(VCFConstants.PASSES_FILTERS_v3) ) if ( filterString.equals(VCFConstants.PASSES_FILTERS_v3) )
generateException(VCFConstants.PASSES_FILTERS_v3 + " is an invalid filter name in vcf4", lineNo); generateException(VCFConstants.PASSES_FILTERS_v3 + " is an invalid filter name in vcf4", lineNo);
if ( filterString.length() == 0 ) if ( filterString.length() == 0 )
generateException("The VCF specification requires a valid filter status: filter was " + filterString, lineNo); generateException("The VCF specification requires a valid filter status: filter was " + filterString, lineNo);
// do we have the filter string cached? // do we have the filter string cached?
if ( cache != null && cache.containsKey(filterString) ) if ( filterHash.containsKey(filterString) )
return Collections.unmodifiableSet(cache.get(filterString)); return filterHash.get(filterString);
// empty set for passes filters // empty set for passes filters
LinkedHashSet<String> fFields = new LinkedHashSet<String>(); List<String> fFields = new LinkedList<String>();
// otherwise we have to parse and cache the value // otherwise we have to parse and cache the value
if ( filterString.indexOf(VCFConstants.FILTER_CODE_SEPARATOR) == -1 ) if ( filterString.indexOf(VCFConstants.FILTER_CODE_SEPARATOR) == -1 )
fFields.add(filterString); fFields.add(filterString);
else else
fFields.addAll(Arrays.asList(filterString.split(VCFConstants.FILTER_CODE_SEPARATOR))); fFields.addAll(Arrays.asList(filterString.split(VCFConstants.FILTER_CODE_SEPARATOR)));
fFields = fFields; filterHash.put(filterString, Collections.unmodifiableList(fFields));
if ( cache != null ) cache.put(filterString, fFields);
return Collections.unmodifiableSet(fFields); return fFields;
}
/**
 * Parse the FORMAT column and the per-sample genotype columns of a single VCF4 record
 * into lazily-materialized Genotype objects. GT is required for VCF 4.0 but may be
 * absent for later versions.
 *
 * @param str the tab-separated FORMAT field followed by one field per sample
 * @param alleles the alleles parsed from the record's REF/ALT columns
 * @param chr contig name; used only to annotate parse-error messages
 * @param pos position; used only to annotate parse-error messages
 * @return lazy genotype data: one Genotype per sample, in header order
 */
public LazyGenotypesContext.LazyData createGenotypeMap(String str, List<Allele> alleles, String chr, int pos) {
// lazily size the scratch array from the header's column count; reused across records
if (genotypeParts == null)
genotypeParts = new String[header.getColumnCount() - NUM_STANDARD_FIELDS];
int nParts = ParsingUtils.split(str, genotypeParts, VCFConstants.FIELD_SEPARATOR_CHAR);
// every record must carry exactly the number of sample columns declared in the header
if ( nParts != genotypeParts.length )
generateException("there are " + (nParts-1) + " genotypes while the header requires that " + (genotypeParts.length-1) + " genotypes be present for all records", lineNo);
ArrayList<Genotype> genotypes = new ArrayList<Genotype>(nParts);
// get the format keys
int nGTKeys = ParsingUtils.split(genotypeParts[0], genotypeKeyArray, VCFConstants.GENOTYPE_FIELD_SEPARATOR_CHAR);
// cycle through the sample names
Iterator<String> sampleNameIterator = header.getGenotypeSamples().iterator();
// clear out our allele mapping
alleleMap.clear();
// cycle through the genotype strings (offset 0 is the FORMAT field itself)
for (int genotypeOffset = 1; genotypeOffset < nParts; genotypeOffset++) {
int GTValueSplitSize = ParsingUtils.split(genotypeParts[genotypeOffset], GTValueArray, VCFConstants.GENOTYPE_FIELD_SEPARATOR_CHAR);
double GTQual = VariantContext.NO_LOG10_PERROR;
Set<String> genotypeFilters = null;
Map<String, Object> gtAttributes = null;
String sampleName = sampleNameIterator.next();
// check to see if the value list is longer than the key list, which is a problem
// (fewer values than keys is legal: trailing fields may be dropped)
if (nGTKeys < GTValueSplitSize)
generateException("There are too many keys for the sample " + sampleName + ", keys = " + parts[8] + ", values = " + parts[genotypeOffset]);
int genotypeAlleleLocation = -1;
if (nGTKeys >= 1) {
gtAttributes = new HashMap<String, Object>(nGTKeys - 1);
for (int i = 0; i < nGTKeys; i++) {
final String gtKey = new String(genotypeKeyArray[i]);
boolean missing = i >= GTValueSplitSize;
// todo -- all of these on the fly parsing of the missing value should be static constants
if (gtKey.equals(VCFConstants.GENOTYPE_KEY)) {
// remember where GT sits; its alleles are parsed after this loop
genotypeAlleleLocation = i;
} else if (gtKey.equals(VCFConstants.GENOTYPE_QUALITY_KEY)) {
GTQual = missing ? parseQual(VCFConstants.MISSING_VALUE_v4) : parseQual(GTValueArray[i]);
} else if (gtKey.equals(VCFConstants.GENOTYPE_FILTER_KEY)) {
genotypeFilters = missing ? parseFilters(VCFConstants.MISSING_VALUE_v4) : parseFilters(getCachedString(GTValueArray[i]));
} else if ( missing ) {
// if its truly missing (there no provided value) skip adding it to the attributes
} else {
gtAttributes.put(gtKey, GTValueArray[i]);
}
}
}
// check to make sure we found a genotype field if we are a VCF4.0 file
if ( version == VCFHeaderVersion.VCF4_0 && genotypeAlleleLocation == -1 )
generateException("Unable to find the GT field for the record; the GT field is required in VCF4.0");
// GT, when present, must be the first FORMAT key
if ( genotypeAlleleLocation > 0 )
generateException("Saw GT field at position " + genotypeAlleleLocation + ", but it must be at the first position for genotypes when present");
List<Allele> GTalleles = (genotypeAlleleLocation == -1 ? new ArrayList<Allele>(0) : parseGenotypeAlleles(GTValueArray[genotypeAlleleLocation], alleles, alleleMap));
// phased iff the GT value uses the '|' separator anywhere
boolean phased = genotypeAlleleLocation != -1 && GTValueArray[genotypeAlleleLocation].indexOf(VCFConstants.PHASED) != -1;
// add it to the list
try {
genotypes.add(new Genotype(sampleName, GTalleles, GTQual, genotypeFilters, gtAttributes, phased));
} catch (TribbleException e) {
// re-throw with the genomic position so the bad record can be located
throw new TribbleException.InternalCodecException(e.getMessage() + ", at position " + chr+":"+pos);
}
}
return new LazyGenotypesContext.LazyData(genotypes, header.sampleNamesInOrder, header.sampleNameToOffset);
}
@Override @Override

View File

@ -56,8 +56,9 @@ public abstract class VCFCompoundHeaderLine extends VCFHeaderLine implements VCF
public String getDescription() { return description; } public String getDescription() { return description; }
public VCFHeaderLineType getType() { return type; } public VCFHeaderLineType getType() { return type; }
public VCFHeaderLineCount getCountType() { return countType; } public VCFHeaderLineCount getCountType() { return countType; }
public boolean isFixedCount() { return countType == VCFHeaderLineCount.INTEGER; }
public int getCount() { public int getCount() {
if ( countType != VCFHeaderLineCount.INTEGER ) if ( ! isFixedCount() )
throw new ReviewedStingException("Asking for header line count when type is not an integer"); throw new ReviewedStingException("Asking for header line count when type is not an integer");
return count; return count;
} }

View File

@ -48,6 +48,7 @@ public final class VCFConstants {
public static final String GENOTYPE_LIKELIHOODS_KEY = "GL"; // log10 scaled genotype likelihoods public static final String GENOTYPE_LIKELIHOODS_KEY = "GL"; // log10 scaled genotype likelihoods
public static final String GENOTYPE_POSTERIORS_KEY = "GP"; public static final String GENOTYPE_POSTERIORS_KEY = "GP";
public static final String GENOTYPE_QUALITY_KEY = "GQ"; public static final String GENOTYPE_QUALITY_KEY = "GQ";
public static final String GENOTYPE_ALLELE_DEPTHS = "AD";
public static final String HAPMAP2_KEY = "H2"; public static final String HAPMAP2_KEY = "H2";
public static final String HAPMAP3_KEY = "H3"; public static final String HAPMAP3_KEY = "H3";
public static final String HAPLOTYPE_QUALITY_KEY = "HQ"; public static final String HAPLOTYPE_QUALITY_KEY = "HQ";
@ -113,7 +114,5 @@ public final class VCFConstants {
public static final String EMPTY_GENOTYPE = "./."; public static final String EMPTY_GENOTYPE = "./.";
public static final int MAX_GENOTYPE_QUAL = 99; public static final int MAX_GENOTYPE_QUAL = 99;
public static final String DOUBLE_PRECISION_FORMAT_STRING = "%.2f";
public static final String DOUBLE_PRECISION_INT_SUFFIX = ".00";
public static final Double VCF_ENCODING_EPSILON = 0.00005; // when we consider fields equal(), used in the Qual compare public static final Double VCF_ENCODING_EPSILON = 0.00005; // when we consider fields equal(), used in the Qual compare
} }

Some files were not shown because too many files have changed in this diff Show More