diff --git a/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java b/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java index 89a39fe33..1e99c7770 100755 --- a/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java +++ b/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java @@ -378,12 +378,23 @@ public class GenomeAnalysisEngine { } /** + * Returns a mapping from original input files to their (merged) read group ids + * + * @return the mapping + */ + public Map> getFileToReadGroupIdMapping() { + return getDataSource().getFileToReadGroupIdMapping(); + } + + /** + * **** UNLESS YOU HAVE GOOD REASON TO, DO NOT USE THIS METHOD; USE getFileToReadGroupIdMapping() INSTEAD **** + * * Returns sets of (remapped) read groups in input SAM stream, grouped by readers (i.e. underlying * individual bam files). For instance: if GATK is run with three input bam files (three -I arguments), then the list * returned by this method will contain 3 elements (one for each reader), with each element being a set of remapped read groups * (i.e. as seen by read.getReadGroup().getReadGroupId() in the merged stream) that come from the corresponding bam file. * - * @return + * @return sets of (merged) read group ids in order of input bams */ public List> getMergedReadGroupsByReaders() { diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/IndexDrivenSAMDataSource.java b/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/IndexDrivenSAMDataSource.java index c3a506cbe..56b2df28d 100644 --- a/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/IndexDrivenSAMDataSource.java +++ b/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/IndexDrivenSAMDataSource.java @@ -4,6 +4,7 @@ import net.sf.samtools.SAMFileHeader; import net.sf.samtools.SAMRecord; import net.sf.samtools.SAMFileReader; import net.sf.samtools.util.CloseableIterator; +import net.sf.picard.sam.SamFileHeaderMerger; import org.broadinstitute.sting.gatk.datasources.shards.Shard; import org.broadinstitute.sting.gatk.datasources.shards.MonolithicShard; @@ -16,8 +17,8 @@ import org.broadinstitute.sting.utils.StingException; import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.sam.SAMReadViolationHistogram; -import java.util.Collection; -import java.util.Collections; +import java.util.*; +import java.io.File; /* * Copyright (c) 2009 The Broad Institute @@ -111,6 +112,14 @@ public class IndexDrivenSAMDataSource extends SAMDataSource { return resourcePool.getHeader(); } + /** + * Returns a mapping from original input files to their (merged) read group ids + * + * @return the mapping + */ + public Map> getFileToReadGroupIdMapping() { + return resourcePool.getFileToReadGroupIdMapping(); + } /** * Returns Reads data structure containing information about the reads data sources placed in this pool as well as diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/ReadStreamResource.java b/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/ReadStreamResource.java index f11169a3e..20e8a7a9b 100644 --- a/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/ReadStreamResource.java +++ b/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/ReadStreamResource.java @@ -33,8 +33,7 @@ import net.sf.samtools.SAMFileReader; import net.sf.samtools.SAMReadGroupRecord; import net.sf.picard.sam.SamFileHeaderMerger; -import java.util.List; -import java.util.ArrayList; +import java.util.*; import java.io.File; /** @@ -75,6 +74,11 @@ class ReadStreamResource { */ private ReadStreamPointer readStreamPointer = null; + /** + * A mapping from original input file to merged read group record ids + */ + private Map> fileToReadGroupIdMap = null; + public ReadStreamResource( Reads sourceInfo ) { SamFileHeaderMerger headerMerger = createHeaderMerger(sourceInfo, SAMFileHeader.SortOrder.coordinate); @@ -104,7 +108,7 @@ class ReadStreamResource { /** * Returns Reads data structure containing information about the reads data sources as well as * information about how they are downsampled, sorted, and filtered - * @return + * @return the Reads object */ public Reads getReadsInfo() { return readStreamPointer.getReadsInfo(); } @@ -112,7 +116,7 @@ class ReadStreamResource { * Returns header merger: a class that keeps the mapping between original read groups and read groups * of the merged stream; merger also provides access to the individual file readers (and hence headers * too) maintained by the system. - * @return + * @return the header merger */ public SamFileHeaderMerger getHeaderMerger() { return readStreamPointer.getHeaderMerger(); } @@ -134,11 +138,14 @@ class ReadStreamResource { return readStreamPointer.getReadsContainedBy(segment); } - public StingSAMIterator getReadsOverlapping( DataStreamSegment segment ) { return readStreamPointer.getReadsOverlapping(segment); } + public Map> getFileToReadGroupIdMapping() { + return fileToReadGroupIdMap; + } + /** * A private function that, given the internal file list, generates a merging construct for * all available files. @@ -149,10 +156,13 @@ class ReadStreamResource { */ private SamFileHeaderMerger createHeaderMerger( Reads reads, SAMFileHeader.SortOrder SORT_ORDER ) throws SimpleDataSourceLoadException { + // right now this is pretty damn heavy, it copies the file list into a reader list every time List lst = new ArrayList(); + Map fileToReaderMap = new HashMap(); for (File f : reads.getReadsFiles()) { SAMFileReader reader = new SAMFileReader(f, eagerDecode); + fileToReaderMap.put(f, reader); reader.setValidationStringency(reads.getValidationStringency()); final SAMFileHeader header = reader.getFileHeader(); @@ -169,6 +179,31 @@ class ReadStreamResource { lst.add(reader); } - return new SamFileHeaderMerger(lst,SORT_ORDER,true); + + // create the header merger + SamFileHeaderMerger headerMerger = new SamFileHeaderMerger(lst,SORT_ORDER,true); + + // populate the file -> read group mapping + fileToReadGroupIdMap = new HashMap>(); + for (Map.Entry entry : fileToReaderMap.entrySet()) { + + Set readGroups = new HashSet(5); + + for (SAMReadGroupRecord g : entry.getValue().getFileHeader().getReadGroups()) { + if (headerMerger.hasReadGroupCollisions()) { + // Check if there were read group clashes. + // If there were, use the SamFileHeaderMerger to translate from the + // original read group id to the read group id in the merged stream + readGroups.add(headerMerger.getReadGroupId(entry.getValue(), g.getReadGroupId())); + } else { + // otherwise, pass through the unmapped read groups since this is what Picard does as well + readGroups.add(g.getReadGroupId()); + } + } + + fileToReadGroupIdMap.put(entry.getKey(), readGroups); + } + + return headerMerger; } } diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/SAMDataSource.java b/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/SAMDataSource.java index b8b366e4a..da04745d3 100755 --- a/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/SAMDataSource.java +++ b/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/SAMDataSource.java @@ -13,6 +13,8 @@ import org.broadinstitute.sting.utils.sam.SAMReadViolationHistogram; import java.io.File; import java.util.Collection; +import java.util.Set; +import java.util.Map; /* * Copyright (c) 2009 The Broad Institute @@ -111,6 +113,13 @@ public abstract class SAMDataSource implements SimpleDataSource { */ public Reads getReadsInfo() { return reads; } + /** + * Returns a mapping from original input files to their (merged) read group ids + * + * @return the mapping + */ + public Map> getFileToReadGroupIdMapping() { return null; } + /** * Returns readers used by this data source. */ diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/SAMResourcePool.java b/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/SAMResourcePool.java index b10c73adf..5df61de57 100644 --- a/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/SAMResourcePool.java +++ b/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/SAMResourcePool.java @@ -27,7 +27,6 @@ package org.broadinstitute.sting.gatk.datasources.simpleDataSources; import org.broadinstitute.sting.gatk.iterators.StingSAMIterator; import org.broadinstitute.sting.gatk.Reads; -import org.broadinstitute.sting.utils.StingException; import org.apache.log4j.Logger; import net.sf.picard.sam.SamFileHeaderMerger; @@ -35,6 +34,9 @@ import net.sf.samtools.SAMFileHeader; import net.sf.samtools.SAMRecord; import java.util.List; +import java.util.Set; +import java.util.Map; +import java.io.File; /** * Maintain a pool of resources of accessors to SAM read data. SAMFileReaders and @@ -50,6 +52,7 @@ class SAMResourcePool extends ResourcePool /** Source information about the reads. */ protected Reads reads; protected SamFileHeaderMerger headerMerger; + protected Map> fileToReadGroupIdMap; /** * Do all the constituent BAM files have indices? We support some very limited @@ -71,7 +74,8 @@ class SAMResourcePool extends ResourcePool this.header = streamResource.getHeader(); this.headerMerger = streamResource.getHeaderMerger(); this.hasIndex = streamResource.hasIndex(); - + this.fileToReadGroupIdMap = streamResource.getFileToReadGroupIdMapping(); + // Add this resource to the pool. this.addNewResource(streamResource); } @@ -88,13 +92,22 @@ class SAMResourcePool extends ResourcePool */ public Reads getReadsInfo() { return reads; } - /** + /** + * Returns a mapping from original input files to their (merged) read group ids + * + * @return the mapping + */ + public Map> getFileToReadGroupIdMapping() { + return fileToReadGroupIdMap; + } + + /** * Returns header merger: a class that keeps the mapping between original read groups and read groups * of the merged stream; merger also provides access to the individual file readers (and hence headers * too) maintained by the system. * @return */ - public SamFileHeaderMerger getHeaderMerger() { return headerMerger; } + public SamFileHeaderMerger getHeaderMerger() { return headerMerger; } protected ReadStreamResource selectBestExistingResource( DataStreamSegment segment, List resources ) { for (ReadStreamResource resource : resources) { diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java b/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java index c69eb4c77..c465882f2 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java @@ -24,30 +24,46 @@ public class IndelRealigner extends ReadWalker { @Argument(fullName="LODThresholdForCleaning", shortName="LOD", doc="LOD threshold above which the cleaner will clean", required=false) protected double LOD_THRESHOLD = 5.0; - @Argument(fullName="EntropyThreshold", shortName="entropy", doc="percentage of mismatches at a locus to be considered having high entropy", required=false) + + @Argument(fullName="entropyThreshold", shortName="entropy", doc="percentage of mismatches at a locus to be considered having high entropy", required=false) protected double MISMATCH_THRESHOLD = 0.15; - @Argument(fullName="OutputBam", shortName="O", required=false, doc="Output bam") - protected SAMFileWriter baseWriter = null; + @Argument(fullName="output", shortName="O", required=false, doc="Output bam (or directory if using --NwayOutput)") + protected String baseWriterFilename = null; + @Argument(fullName="NWayOutput", shortName="nway", required=false, doc="Should the reads be emitted in a separate bam file for each one of the input bams? [default:yes]") + protected boolean NWAY_OUTPUT = true; + + @Argument(fullName="outputSuffix", shortName="suffix", required=false, doc="Suffix to append to output bams (when using --NwayOutput) [default:'.cleaned']") + protected String outputSuffix = ".cleaned"; + + @Argument(fullName="bam_compression", shortName="compress", required=false, doc="Compression level to use for output bams [default:5]") + protected Integer compressionLevel = 5; + + // ADVANCED OPTIONS FOLLOW + + @Argument(fullName="outputIndels", shortName="indels", required=false, doc="Output file (text) for the indels found") + protected String OUT_INDELS = null; - @Argument(fullName="OutputIndels", shortName="indels", required=false, doc="Output file (text) for the indels found") - String OUT_INDELS = null; @Argument(fullName="statisticsFile", shortName="stats", doc="print out statistics (what does or doesn't get cleaned)", required=false) - String OUT_STATS = null; + protected String OUT_STATS = null; + @Argument(fullName="SNPsFile", shortName="snps", doc="print out whether mismatching columns do or don't get cleaned out", required=false) - String OUT_SNPS = null; + protected String OUT_SNPS = null; + @Argument(fullName="maxConsensuses", shortName="maxConsensuses", doc="max alternate consensuses to try (necessary to improve performance in deep coverage)", required=false) - int MAX_CONSENSUSES = 30; + protected int MAX_CONSENSUSES = 30; + @Argument(fullName="maxReadsForConsensuses", shortName="greedy", doc="max reads used for finding the alternate consensuses (necessary to improve performance in deep coverage)", required=false) - int MAX_READS_FOR_CONSENSUSES = 120; + protected int MAX_READS_FOR_CONSENSUSES = 120; @Argument(fullName="maxReadsForRealignment", shortName="maxReads", doc="max reads allowed at an interval for realignment", required=false) - int MAX_READS = 20000; + protected int MAX_READS = 20000; @Argument(fullName="writerWindowSize", shortName="writerWindowSize", doc="the window over which the writer will store reads", required=false) protected int SORTING_WRITER_WINDOW = 100; + // the intervals input by the user private Iterator intervals = null; @@ -59,7 +75,7 @@ public class IndelRealigner extends ReadWalker { private ArrayList readsNotToClean = new ArrayList(); // the wrapper around the SAM writer - private SortingSAMFileWriter writer = null; + private Map writers = null; // random number generator private Random generator; @@ -87,13 +103,36 @@ public class IndelRealigner extends ReadWalker { if ( MISMATCH_THRESHOLD <= 0.0 || MISMATCH_THRESHOLD > 1.0 ) throw new RuntimeException("Entropy threshold must be a fraction between 0 and 1"); + // read in the intervals for cleaning List locs = GenomeAnalysisEngine.parseIntervalRegion(Arrays.asList(intervalsFile)); intervals = GenomeLocSortedSet.createSetFromList(locs).iterator(); currentInterval = intervals.hasNext() ? intervals.next() : null; - if ( baseWriter != null ) - writer = new SortingSAMFileWriter(baseWriter, SORTING_WRITER_WINDOW); + // set up the output writer(s) + if ( baseWriterFilename != null ) { + writers = new HashMap(); + Map> readGroupMap = getToolkit().getFileToReadGroupIdMapping(); + SAMFileWriterFactory factory = new SAMFileWriterFactory(); + if ( NWAY_OUTPUT ) { + for ( File file : readGroupMap.keySet() ) { + String newFileName = file.getName().substring(0, file.getName().length()-3) + outputSuffix + ".bam"; + SAMFileWriter baseWriter = factory.makeBAMWriter(getToolkit().getSAMFileHeader(), true, new File(baseWriterFilename, newFileName), compressionLevel); + SortingSAMFileWriter writer = new SortingSAMFileWriter(baseWriter, SORTING_WRITER_WINDOW); + for ( String rg : readGroupMap.get(file) ) + writers.put(rg, writer); + } + } else { + SAMFileWriter baseWriter = factory.makeBAMWriter(getToolkit().getSAMFileHeader(), true, new File(baseWriterFilename), compressionLevel); + SortingSAMFileWriter writer = new SortingSAMFileWriter(baseWriter, SORTING_WRITER_WINDOW); + for ( Set set : readGroupMap.values() ) { + for ( String rg : set ) + writers.put(rg, writer); + } + } + } + + // set up the random generator generator = new Random(RANDOM_SEED); if ( OUT_INDELS != null ) { @@ -126,13 +165,40 @@ public class IndelRealigner extends ReadWalker { } private void emit(SAMRecord read) { - if ( writer != null ) - writer.addAlignment(read); + if ( writers != null ) { + SAMReadGroupRecord readGroup = read.getReadGroup(); + if ( readGroup == null || readGroup.getReadGroupId() == null ) { + if ( writers.size() > 1 ) + throw new StingException("There are multiple output writers but read " + read.toString() + " has no read group"); + writers.values().iterator().next().addAlignment(read); + } else { + writers.get(readGroup.getReadGroupId()).addAlignment(read); + } + } } private void emit(List reads) { - if ( writer != null ) - writer.addAlignments(reads); + if ( writers == null ) + return; + + // break out the reads into sets for their respective writers + Map> bins = new HashMap>(); + for ( SortingSAMFileWriter writer : writers.values() ) + bins.put(writer, new HashSet()); + + for ( SAMRecord read : reads ) { + SAMReadGroupRecord readGroup = read.getReadGroup(); + if ( readGroup == null || readGroup.getReadGroupId() == null ) { + if ( writers.size() > 1 ) + throw new StingException("There are multiple output writers but read " + read.toString() + " has no read group"); + bins.get(writers.values().iterator().next()).add(read); + } else { + bins.get(writers.get(readGroup.getReadGroupId())).add(read); + } + } + + for ( Map.Entry> entry : bins.entrySet() ) + entry.getKey().addAlignments(entry.getValue()); } public Integer map(char[] ref, SAMRecord read) { @@ -146,7 +212,7 @@ public class IndelRealigner extends ReadWalker { if ( readLoc.getStop() == 0 ) readLoc = GenomeLocParser.createGenomeLoc(readLoc.getContig(), readLoc.getStart(), readLoc.getStart()); - if ( readLoc.isBefore(currentInterval) ) { + if ( readLoc.isBefore(currentInterval) || Utils.is454Read(read) ) { emit(read); return 0; } @@ -155,8 +221,7 @@ public class IndelRealigner extends ReadWalker { read.getDuplicateReadFlag() || read.getNotPrimaryAlignmentFlag() || read.getMappingQuality() == 0 || - read.getAlignmentStart() == SAMRecord.NO_ALIGNMENT_START || - Utils.is454Read(read) ) { + read.getAlignmentStart() == SAMRecord.NO_ALIGNMENT_START ) { readsNotToClean.add(read); } else { readsToClean.add(read, ref); @@ -206,8 +271,16 @@ public class IndelRealigner extends ReadWalker { emit(readsNotToClean); } - if ( writer != null ) - writer.close(); + if ( writers != null ) { + for ( SortingSAMFileWriter writer : writers.values() ) { + writer.close(); + + // TODO -- fix me + try { + writer.getBaseWriter().close(); + } catch (net.sf.samtools.util.RuntimeIOException e) {} + } + } if ( OUT_INDELS != null ) { try { @@ -905,7 +978,6 @@ public class IndelRealigner extends ReadWalker { private Cigar newCigar = null; private int newStart = -1; private int mismatchScoreToReference; - private boolean updated = false; public AlignedRead(SAMRecord read) { this.read = read; @@ -978,14 +1050,9 @@ public class IndelRealigner extends ReadWalker { read.setCigar(newCigar); read.setAlignmentStart(newStart); } - updated = true; return true; } - public boolean wasUpdated() { - return updated; - } - public void setMismatchScoreToReference(int score) { mismatchScoreToReference = score; } diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/indels/SortingSAMFileWriter.java b/java/src/org/broadinstitute/sting/gatk/walkers/indels/SortingSAMFileWriter.java index 748aeea00..70bbafcae 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/indels/SortingSAMFileWriter.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/indels/SortingSAMFileWriter.java @@ -4,7 +4,7 @@ import net.sf.samtools.*; import java.util.TreeSet; import java.util.Iterator; -import java.util.List; +import java.util.Collection; /** * @author ebanks @@ -66,14 +66,14 @@ public class SortingSAMFileWriter implements SAMFileWriter { /** * Add a list of reads to the writer for emission; the reads do NOT need to be sorted * - * @param reads the reads to emit + * @param reads the reads to emit */ - public void addAlignments(List reads) { + public void addAlignments(Collection reads) { if ( reads.size() == 0 ) return; // at a new contig, clear the cache - if ( cachedReads.size() > 0 && cachedReads.first().getReferenceIndex() < reads.get(0).getReferenceIndex() ) + if ( cachedReads.size() > 0 && cachedReads.first().getReferenceIndex() < reads.iterator().next().getReferenceIndex() ) clearCache(); cachedReads.addAll(reads); @@ -96,19 +96,26 @@ public class SortingSAMFileWriter implements SAMFileWriter { } /** - * get the SAM file header + * @return the SAM file header */ public SAMFileHeader getFileHeader() { return baseWriter.getFileHeader(); } /** - * close this writer by clearing the cache + * close this writer by clearing the cache (but DO NOT close the underlying SAMFileWriter) */ public void close() { clearCache(); } + /** + * @return the underlying SAMFileWriter + */ + public SAMFileWriter getBaseWriter() { + return baseWriter; + } + private void clearCache() { Iterator iter = cachedReads.iterator(); while ( iter.hasNext() ) diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/qc/VCFValidator.java b/java/src/org/broadinstitute/sting/gatk/walkers/qc/VCFValidator.java index 70f56c292..10936d1de 100644 --- a/java/src/org/broadinstitute/sting/gatk/walkers/qc/VCFValidator.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/qc/VCFValidator.java @@ -15,7 +15,7 @@ import org.broadinstitute.sting.utils.StingException; /** * A light-weight validator for a VCF file. */ -@Requires(value={},referenceMetaData=@RMD(name="vcf",type= RodVCF.class)) +@Requires(value={},referenceMetaData=@RMD(name="vcf", type=RodVCF.class)) public class VCFValidator extends RodWalker { /** @@ -28,7 +28,7 @@ public class VCFValidator extends RodWalker { RODRecordList rodlist = tracker.getTrackData("vcf", null); if ( rodlist != null ) { RodVCF rod = (RodVCF)rodlist.getRecords().get(0); - if ( (rod.isSNP() || rod.isReference()) && rod.getReference().charAt(0) != ref.getBase() ) + if ( (rod.isSNP() || rod.isReference()) && Character.toUpperCase(rod.getReference().charAt(0)) != Character.toUpperCase(ref.getBase()) ) throw new StingException("The reference base (" + ref.getBase() + ") does not match the base from the VCF record (" + rod.getReference() + ")"); } } diff --git a/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFRecord.java b/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFRecord.java index 4abd305ee..3060d959f 100644 --- a/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFRecord.java +++ b/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFRecord.java @@ -484,8 +484,8 @@ public class VCFRecord implements Variation, VariantBackedByGenotype { } public void setQual(double qual) { - if (qual < 0) - throw new IllegalArgumentException("Qual values must be greater than 0"); + if ( qual < 0 && MathUtils.compareDoubles(qual, -1.0) != 0 ) + throw new IllegalArgumentException("Qual values cannot be negative unless they are -1 ('unknown')"); mQual = qual; }