Various improvements and fixes:

In the indel cleaner:
1. Allow the user to request Picard's SAMFileWriter sorting on disk instead of having us sort in memory; this is useful when the input consists of long reads.
2. For N-way-out mode, output bams now use the original headers from the corresponding input bams, as opposed to the merged header. This entailed some reworking of the datasources code.
3. Intermediate check-in of code that lets the user supply known indels to be tried as alternate consensuses. Not done yet.

In UG (the UnifiedGenotyper): fix a bug in the Beagle output for Jared.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2805 348d0f76-0448-11de-a6fe-93d51630548a
parent 57a168c0db
commit ca1917507f
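Item 1 of the message reduces to a single decision at writer-construction time (see the IndelRealigner hunks below): either tell Picard's BAM writer the stream is not presorted and let it sort on disk, or claim presorted output and wrap the writer so reads are sorted in memory. A condensed sketch of that pattern, hedged: the WriterChoice helper framing is illustrative and not part of this commit; SortingSAMFileWriter is the existing GATK in-memory sorting wrapper (its import is omitted here).

    import net.sf.samtools.SAMFileHeader;
    import net.sf.samtools.SAMFileWriter;
    import net.sf.samtools.SAMFileWriterFactory;
    import java.io.File;

    // Illustrative wrapper around the pattern the IndelRealigner diff uses below.
    class WriterChoice {
        static SAMFileWriter make(SAMFileHeader header, File out,
                                  boolean sortOnDisk, int window, int compression) {
            // presorted == !sortOnDisk: with on-disk sorting, Picard's writer is
            // told the reads are unsorted and sorts them itself (slower, but
            // robust for long reads that overflow the in-memory sorting window).
            SAMFileWriter writer = new SAMFileWriterFactory()
                    .makeBAMWriter(header, !sortOnDisk, out, compression);
            if ( !sortOnDisk )
                writer = new SortingSAMFileWriter(writer, window); // sort in memory instead
            return writer;
        }
    }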
@@ -377,13 +377,45 @@ public class GenomeAnalysisEngine {
     }
 
+    /**
+     * Returns a mapping from original input files to the SAMFileReaders
+     *
+     * @return the mapping
+     */
+    public Map<File, SAMFileReader> getFileToReaderMapping() {
+        return getDataSource().getFileToReaderMapping();
+    }
+
     /**
      * Returns a mapping from original input files to their (merged) read group ids
      *
      * @return the mapping
      */
     public Map<File, Set<String>> getFileToReadGroupIdMapping() {
-        return getDataSource().getFileToReadGroupIdMapping();
+        Map<File, SAMFileReader> fileToReaderMap = getFileToReaderMapping();
+
+        // populate the file -> read group mapping
+        Map<File, Set<String>> fileToReadGroupIdMap = new HashMap<File, Set<String>>();
+        for (Map.Entry<File, SAMFileReader> entry : fileToReaderMap.entrySet()) {
+
+            Set<String> readGroups = new HashSet<String>(5);
+
+            for (SAMReadGroupRecord g : entry.getValue().getFileHeader().getReadGroups()) {
+                if (getDataSource().hasReadGroupCollisions()) {
+                    // Check if there were read group clashes.
+                    // If there were, use the SamFileHeaderMerger to translate from the
+                    // original read group id to the read group id in the merged stream
+                    readGroups.add(getDataSource().getReadGroupId(entry.getValue(), g.getReadGroupId()));
+                } else {
+                    // otherwise, pass through the unmapped read groups since this is what Picard does as well
+                    readGroups.add(g.getReadGroupId());
+                }
+            }
+
+            fileToReadGroupIdMap.put(entry.getKey(), readGroups);
+        }
+
+        return fileToReadGroupIdMap;
     }
 
     /**
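The engine now exposes both per-file views. A walker that wants to route each read back to its source BAM can invert the read-group mapping; a hedged sketch (the readGroupToFile index and surrounding walker context are illustrative, not part of this commit):

    // Hypothetical walker snippet: build a merged-read-group-id -> source-file
    // index from the new engine accessor. This is essentially what the
    // IndelRealigner N-way output setup does with writers further below.
    Map<File, Set<String>> readGroups = getToolkit().getFileToReadGroupIdMapping();
    Map<String, File> readGroupToFile = new HashMap<String, File>();
    for (Map.Entry<File, Set<String>> e : readGroups.entrySet())
        for (String rg : e.getValue())
            readGroupToFile.put(rg, e.getKey()); // merged RG id -> originating BAM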
@@ -113,12 +113,12 @@ public class IndexDrivenSAMDataSource extends SAMDataSource {
     }
 
     /**
-     * Returns a mapping from original input files to their (merged) read group ids
+     * Returns a mapping from original input files to the SAMFileReaders
      *
      * @return the mapping
      */
-    public Map<File, Set<String>> getFileToReadGroupIdMapping() {
-        return resourcePool.getFileToReadGroupIdMapping();
+    public Map<File, SAMFileReader> getFileToReaderMapping() {
+        return resourcePool.getFileToReaderMapping();
     }
 
     /**
@@ -77,7 +77,7 @@ class ReadStreamResource {
     /**
      * A mapping from original input file to merged read group record ids
      */
-    private Map<File, Set<String>> fileToReadGroupIdMap = null;
+    private Map<File, SAMFileReader> fileToReaderMap = null;
 
     public ReadStreamResource( Reads sourceInfo ) {
         SamFileHeaderMerger headerMerger = createHeaderMerger(sourceInfo, SAMFileHeader.SortOrder.coordinate);

@@ -142,8 +142,8 @@ class ReadStreamResource {
         return readStreamPointer.getReadsOverlapping(segment);
     }
 
-    public Map<File, Set<String>> getFileToReadGroupIdMapping() {
-        return fileToReadGroupIdMap;
+    public Map<File, SAMFileReader> getFileToReaderMapping() {
+        return fileToReaderMap;
     }
 
     /**

@@ -159,7 +159,7 @@ class ReadStreamResource {
 
         // right now this is pretty damn heavy, it copies the file list into a reader list every time
         List<SAMFileReader> lst = new ArrayList<SAMFileReader>();
-        Map<File, SAMFileReader> fileToReaderMap = new HashMap<File, SAMFileReader>();
+        fileToReaderMap = new HashMap<File, SAMFileReader>();
         for (File f : reads.getReadsFiles()) {
             SAMFileReader reader = new SAMFileReader(f, eagerDecode);
             fileToReaderMap.put(f, reader);

@@ -180,30 +180,6 @@ class ReadStreamResource {
             lst.add(reader);
         }
 
-        // create the header merger
-        SamFileHeaderMerger headerMerger = new SamFileHeaderMerger(lst,SORT_ORDER,true);
-
-        // populate the file -> read group mapping
-        fileToReadGroupIdMap = new HashMap<File, Set<String>>();
-        for (Map.Entry<File, SAMFileReader> entry : fileToReaderMap.entrySet()) {
-
-            Set<String> readGroups = new HashSet<String>(5);
-
-            for (SAMReadGroupRecord g : entry.getValue().getFileHeader().getReadGroups()) {
-                if (headerMerger.hasReadGroupCollisions()) {
-                    // Check if there were read group clashes.
-                    // If there were, use the SamFileHeaderMerger to translate from the
-                    // original read group id to the read group id in the merged stream
-                    readGroups.add(headerMerger.getReadGroupId(entry.getValue(), g.getReadGroupId()));
-                } else {
-                    // otherwise, pass through the unmapped read groups since this is what Picard does as well
-                    readGroups.add(g.getReadGroupId());
-                }
-            }
-
-            fileToReadGroupIdMap.put(entry.getKey(), readGroups);
-        }
-
-        return headerMerger;
+        return new SamFileHeaderMerger(lst,SORT_ORDER,true);
     }
 }
@@ -13,7 +13,6 @@ import org.broadinstitute.sting.utils.sam.SAMReadViolationHistogram;
 
 import java.io.File;
 import java.util.Collection;
-import java.util.Set;
 import java.util.Map;
 
 /*

@@ -118,7 +117,7 @@ public abstract class SAMDataSource implements SimpleDataSource {
      *
      * @return the mapping
      */
-    public Map<File, Set<String>> getFileToReadGroupIdMapping() { return null; }
+    public Map<File, SAMFileReader> getFileToReaderMapping() { return null; }
 
     /**
      * Returns readers used by this data source.
@@ -32,9 +32,9 @@ import org.apache.log4j.Logger;
 import net.sf.picard.sam.SamFileHeaderMerger;
 import net.sf.samtools.SAMFileHeader;
 import net.sf.samtools.SAMRecord;
+import net.sf.samtools.SAMFileReader;
 
 import java.util.List;
-import java.util.Set;
 import java.util.Map;
 import java.io.File;
 
@@ -52,7 +52,7 @@ class SAMResourcePool extends ResourcePool<ReadStreamResource, StingSAMIterator>
     /** Source information about the reads. */
     protected Reads reads;
     protected SamFileHeaderMerger headerMerger;
-    protected Map<File, Set<String>> fileToReadGroupIdMap;
+    protected Map<File, SAMFileReader> fileToReaderMap;
 
     /**
      * Do all the constituent BAM files have indices? We support some very limited

@@ -74,7 +74,7 @@ class SAMResourcePool extends ResourcePool<ReadStreamResource, StingSAMIterator>
         this.header = streamResource.getHeader();
         this.headerMerger = streamResource.getHeaderMerger();
         this.hasIndex = streamResource.hasIndex();
-        this.fileToReadGroupIdMap = streamResource.getFileToReadGroupIdMapping();
+        this.fileToReaderMap = streamResource.getFileToReaderMapping();
 
         // Add this resource to the pool.
         this.addNewResource(streamResource);

@@ -93,12 +93,12 @@ class SAMResourcePool extends ResourcePool<ReadStreamResource, StingSAMIterator>
     public Reads getReadsInfo() { return reads; }
 
     /**
-     * Returns a mapping from original input files to their (merged) read group ids
+     * Returns a mapping from original input files to the SAMFileReaders
      *
      * @return the mapping
      */
-    public Map<File, Set<String>> getFileToReadGroupIdMapping() {
-        return fileToReadGroupIdMap;
+    public Map<File, SAMFileReader> getFileToReaderMapping() {
+        return fileToReaderMap;
     }
 
     /**
@@ -138,11 +138,11 @@ public class DiploidGenotypeCalculationModel extends JointEstimateGenotypeCalcul
         }
         double[] likelihoods = gl.getLikelihoods();
         beagleWriter.print(' ');
-        beagleWriter.print(String.format("%.6f", Math.pow(10, likelihoods[GenotypeType.REF.ordinal()])));
+        beagleWriter.print(String.format("%.6f", Math.pow(10, likelihoods[refGenotype.ordinal()])));
         beagleWriter.print(' ');
-        beagleWriter.print(String.format("%.6f", Math.pow(10, likelihoods[GenotypeType.HET.ordinal()])));
+        beagleWriter.print(String.format("%.6f", Math.pow(10, likelihoods[hetGenotype.ordinal()])));
         beagleWriter.print(' ');
-        beagleWriter.print(String.format("%.6f", Math.pow(10, likelihoods[GenotypeType.HOM.ordinal()])));
+        beagleWriter.print(String.format("%.6f", Math.pow(10, likelihoods[homGenotype.ordinal()])));
     }
 }
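The Beagle fix replaces the fixed GenotypeType.REF/HET/HOM slots with the site-specific diploid genotype indices (e.g. AA/AC/CC when the reference base is A): the likelihoods array is evidently indexed by diploid genotype, so the fixed ordinals pointed at the wrong entries. The conversion itself is unchanged — each log10 likelihood is mapped back to linear space for the Beagle input file. A small hedged illustration with invented values:

    // log10-scaled genotype likelihoods -> linear-space values, as printed
    // to the Beagle input file via Math.pow(10, likelihood).
    double[] log10Likelihoods = { -0.1, -1.2, -3.5 };   // example values only
    for (double l : log10Likelihoods)
        System.out.print(String.format("%.6f ", Math.pow(10, l)));
    // prints: 0.794328 0.063096 0.000316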
@@ -3,6 +3,7 @@ package org.broadinstitute.sting.gatk.walkers.indels;
 import org.broadinstitute.sting.utils.*;
 import org.broadinstitute.sting.gatk.walkers.ReadWalker;
 import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
+import org.broadinstitute.sting.gatk.refdata.*;
 import org.broadinstitute.sting.utils.cmdLine.Argument;
 
 import net.sf.samtools.*;

@@ -35,11 +36,17 @@ public class IndelRealigner extends ReadWalker<Integer, Integer> {
     protected boolean NWAY_OUTPUT = true;
 
     @Argument(fullName="outputSuffix", shortName="suffix", required=false, doc="Suffix to append to output bams (when using --NwayOutput) [default:'.cleaned']")
-    protected String outputSuffix = ".cleaned";
+    protected String outputSuffix = "cleaned";
 
     @Argument(fullName="bam_compression", shortName="compress", required=false, doc="Compression level to use for output bams [default:5]")
     protected Integer compressionLevel = 5;
 
+    @Argument(fullName="sortOnDisk", shortName="sortOnDisk", required=false, doc="Should we sort on disk instead of on the fly? This option is much slower but should be used when on-the-fly sorting fails because reads are too long [default:no]")
+    protected boolean SORT_ON_DISK = false;
+
+    @Argument(fullName="knownIndels", shortName="knownIndels", required=false, doc="One or more rod triplets <binding,type,path> of known indels to try for alternate consenses; types must implement VariationRod")
+    protected ArrayList<String> knownIndels = new ArrayList<String>();
+
     // ADVANCED OPTIONS FOLLOW
 
     @Argument(fullName="outputIndels", shortName="indels", required=false, doc="Output file (text) for the indels found")

@@ -73,9 +80,14 @@ public class IndelRealigner extends ReadWalker<Integer, Integer> {
     // the reads that fall into the current interval
     private ReadBin readsToClean = new ReadBin();
     private ArrayList<SAMRecord> readsNotToClean = new ArrayList<SAMRecord>();
+    private TreeSet<VariationRod> knownIndelsToTry = new TreeSet<VariationRod>(new Comparator<VariationRod>(){
+        public int compare(VariationRod rod1, VariationRod rod2) {
+            return (int)(rod1.getLocation().getStart() - rod2.getLocation().getStart());
+        }
+    });
 
     // the wrapper around the SAM writer
-    private Map<String, SortingSAMFileWriter> writers = null;
+    private Map<String, SAMFileWriter> writers = null;
 
     // random number generator
     private Random generator;
@@ -110,21 +122,24 @@ public class IndelRealigner extends ReadWalker<Integer, Integer> {
 
         // set up the output writer(s)
         if ( baseWriterFilename != null ) {
-            writers = new HashMap<String, SortingSAMFileWriter>();
+            writers = new HashMap<String, SAMFileWriter>();
             Map<File, Set<String>> readGroupMap = getToolkit().getFileToReadGroupIdMapping();
             SAMFileWriterFactory factory = new SAMFileWriterFactory();
 
             if ( NWAY_OUTPUT ) {
-                for ( File file : readGroupMap.keySet() ) {
+                Map<File, SAMFileReader> readerMap = getToolkit().getFileToReaderMapping();
+                for ( File file : readerMap.keySet() ) {
                     String newFileName = file.getName().substring(0, file.getName().length()-3) + outputSuffix + ".bam";
-                    SAMFileWriter baseWriter = factory.makeBAMWriter(getToolkit().getSAMFileHeader(), true, new File(baseWriterFilename, newFileName), compressionLevel);
-                    SortingSAMFileWriter writer = new SortingSAMFileWriter(baseWriter, SORTING_WRITER_WINDOW);
+                    SAMFileWriter writer = factory.makeBAMWriter(readerMap.get(file).getFileHeader(), !SORT_ON_DISK, new File(baseWriterFilename, newFileName), compressionLevel);
+                    if ( !SORT_ON_DISK )
+                        writer = new SortingSAMFileWriter(writer, SORTING_WRITER_WINDOW);
                     for ( String rg : readGroupMap.get(file) )
                         writers.put(rg, writer);
                 }
             } else {
-                SAMFileWriter baseWriter = factory.makeBAMWriter(getToolkit().getSAMFileHeader(), true, new File(baseWriterFilename), compressionLevel);
-                SortingSAMFileWriter writer = new SortingSAMFileWriter(baseWriter, SORTING_WRITER_WINDOW);
+                SAMFileWriter writer = factory.makeBAMWriter(getToolkit().getSAMFileHeader(), !SORT_ON_DISK, new File(baseWriterFilename), compressionLevel);
+                if ( !SORT_ON_DISK )
+                    writer = new SortingSAMFileWriter(writer, SORTING_WRITER_WINDOW);
                 for ( Set<String> set : readGroupMap.values() ) {
                     for ( String rg : set )
                         writers.put(rg, writer);
@@ -135,6 +150,22 @@ public class IndelRealigner extends ReadWalker<Integer, Integer> {
         // set up the random generator
         generator = new Random(RANDOM_SEED);
 
+        // set up the rods (since this is a ReadWalker we don't get rods from the traversal)
+        List<ReferenceOrderedData<? extends ReferenceOrderedDatum>> rods = new ArrayList<ReferenceOrderedData<? extends ReferenceOrderedDatum>>();
+        ReferenceOrderedData.parseBindings(knownIndels, rods);
+        for ( ReferenceOrderedData<? extends ReferenceOrderedDatum> rod : rods ) {
+            if ( !(rod instanceof VariationRod) )
+                continue;
+            SeekableRODIterator<? extends ReferenceOrderedDatum> iter = rod.iterator();
+            while ( iter.hasNext() ) {
+                RODRecordList<? extends ReferenceOrderedDatum> records = iter.next();
+                for ( ReferenceOrderedDatum record : records ) {
+                    if (((VariationRod)record).isIndel())
+                        knownIndelsToTry.add((VariationRod)record);
+                }
+            }
+        }
+
         if ( OUT_INDELS != null ) {
             try {
                 indelOutput = new FileWriter(new File(OUT_INDELS));
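The known indels are kept in a TreeSet ordered by start coordinate; the per-region lookup flagged by the TODO in a later hunk is not implemented in this commit. A hypothetical sketch of the scan that ordering enables (intervalStart, intervalStop, and candidates are illustrative names, not code from this change):

    // Hypothetical, NOT part of this commit: select known indels overlapping
    // the current cleaning interval by scanning the start-sorted TreeSet.
    List<VariationRod> candidates = new ArrayList<VariationRod>();
    for ( VariationRod indel : knownIndelsToTry ) {
        long start = indel.getLocation().getStart();
        if ( start > intervalStop )
            break;                     // set is sorted by start, safe to stop early
        if ( start >= intervalStart )
            candidates.add(indel);     // try as an alternate consensus
    }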
@@ -182,8 +213,8 @@ public class IndelRealigner extends ReadWalker<Integer, Integer> {
             return;
 
         // break out the reads into sets for their respective writers
-        Map<SortingSAMFileWriter, Set<SAMRecord>> bins = new HashMap<SortingSAMFileWriter, Set<SAMRecord>>();
-        for ( SortingSAMFileWriter writer : writers.values() )
+        Map<SAMFileWriter, Set<SAMRecord>> bins = new HashMap<SAMFileWriter, Set<SAMRecord>>();
+        for ( SAMFileWriter writer : writers.values() )
             bins.put(writer, new HashSet<SAMRecord>());
 
         for ( SAMRecord read : reads ) {

@@ -197,8 +228,15 @@ public class IndelRealigner extends ReadWalker<Integer, Integer> {
             }
         }
 
-        for ( Map.Entry<SortingSAMFileWriter, Set<SAMRecord>> entry : bins.entrySet() )
-            entry.getKey().addAlignments(entry.getValue());
+        for ( Map.Entry<SAMFileWriter, Set<SAMRecord>> entry : bins.entrySet() ) {
+            if ( !SORT_ON_DISK ) {
+                // we can be efficient in this case by batching the reads all together
+                ((SortingSAMFileWriter)entry.getKey()).addAlignments(entry.getValue());
+            } else {
+                for ( SAMRecord read : entry.getValue() )
+                    entry.getKey().addAlignment(read);
+            }
+        }
     }
 
     public Integer map(char[] ref, SAMRecord read) {

@@ -237,6 +275,9 @@ public class IndelRealigner extends ReadWalker<Integer, Integer> {
             }
         }
         else { // the read is past the current interval
+
+            // TODO - NOW WE NEED TO GET THE APPROPRIATE KNOWN INDELS FOR THIS REGION
+
             clean(readsToClean);
 
             // merge the two sets for emission
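The emit path above relies on an invariant from initialize(): when not sorting on disk, every writer stored in the map was wrapped in a SortingSAMFileWriter, so the downcast is safe and the batch addAlignments() call keeps the in-memory sort window efficient. A hedged restatement of that dispatch as a helper (illustrative only, not in this commit):

    // Illustrative helper: centralizes the emit-path dispatch so the
    // SortingSAMFileWriter downcast lives in one place.
    private void emit(SAMFileWriter writer, Set<SAMRecord> batch) {
        if ( !SORT_ON_DISK ) {
            // initialize() wrapped every writer, so this cast cannot fail
            ((SortingSAMFileWriter)writer).addAlignments(batch);
        } else {
            for ( SAMRecord read : batch )
                writer.addAlignment(read);  // Picard's writer sorts on disk
        }
    }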
@@ -272,13 +313,14 @@ public class IndelRealigner extends ReadWalker<Integer, Integer> {
         }
 
         if ( writers != null ) {
-            for ( SortingSAMFileWriter writer : writers.values() ) {
-                writer.close();
-                // TODO -- fix me
+            for ( SAMFileWriter writer : writers.values() ) {
                 // TODO -- figure out why we're getting an exception thrown here
+                // TODO -- because we need to call close() to flush out the remaining reads from the writer
                 try {
-                    writer.getBaseWriter().close();
-                } catch (net.sf.samtools.util.RuntimeIOException e) {}
+                    writer.close();
+                    if ( !SORT_ON_DISK )
+                        ((SortingSAMFileWriter)writer).getBaseWriter().close();
+                } catch (RuntimeException e) {}
             }
         }
 
@@ -191,7 +191,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
     // --------------------------------------------------------------------------------------------------------------
     @Test
     public void testOtherOutput() {
-        String[] md5s = {"78482125d51f9eb2ee850a6b25921e84", "cf39a4a90e9f01b6f1183e6b1fd7520e"};
+        String[] md5s = {"78482125d51f9eb2ee850a6b25921e84", "8cba0b8752f18fc620b4697840bc7291"};
         WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
             "-T UnifiedGenotyper" +
             " -R " + oneKGLocation + "reference/human_b36_both.fasta" +