diff --git a/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisTK.java b/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisTK.java index a9a9334b5..a7378b076 100644 --- a/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisTK.java +++ b/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisTK.java @@ -7,28 +7,24 @@ import net.sf.samtools.SAMFileReader; import net.sf.samtools.SAMFileReader.ValidationStringency; import net.sf.samtools.SAMSequenceRecord; import net.sf.samtools.util.RuntimeIOException; -import org.apache.log4j.Logger; -import org.apache.commons.cli.OptionBuilder; import org.apache.commons.cli.Option; +import org.apache.commons.cli.OptionBuilder; +import org.apache.log4j.Logger; +import org.broadinstitute.sting.gatk.executive.MicroManager; import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedData; import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum; -import org.broadinstitute.sting.gatk.refdata.rodDbSNP; -import org.broadinstitute.sting.gatk.refdata.rodGFF; -import org.broadinstitute.sting.gatk.refdata.HapMapAlleleFrequenciesROD; -import org.broadinstitute.sting.gatk.refdata.rodSAMPileup; +import org.broadinstitute.sting.gatk.traversals.*; import org.broadinstitute.sting.gatk.walkers.LocusWalker; import org.broadinstitute.sting.gatk.walkers.ReadWalker; import org.broadinstitute.sting.gatk.walkers.Walker; -import org.broadinstitute.sting.gatk.traversals.*; -import org.broadinstitute.sting.gatk.executive.MicroManager; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.cmdLine.CommandLineProgram; import org.broadinstitute.sting.utils.fasta.FastaSequenceFile2; import java.io.File; -import java.io.PrintStream; import java.io.FileNotFoundException; +import java.io.PrintStream; import java.util.ArrayList; import java.util.List; @@ -332,8 +328,14 @@ public class GenomeAnalysisTK extends CommandLineProgram { engine.initialize(); if( microManager != null ) { - List locations = GenomeLoc.parseGenomeLocs( REGION_STR ); - microManager.execute( my_walker, locations ); + List locs; + if (INTERVALS_FILE != null) { + locs = GenomeLoc.IntervalFileToList(INTERVALS_FILE); + microManager.setIntervalList(locs); + } else { + locs = GenomeLoc.parseGenomeLocs( REGION_STR ); + } + microManager.execute( my_walker, locs ); } else engine.traverse(my_walker); diff --git a/java/src/org/broadinstitute/sting/gatk/dataSources/shards/LocusShardStrategy.java b/java/src/org/broadinstitute/sting/gatk/dataSources/shards/LocusShardStrategy.java index 6585c24b9..817cd7d05 100755 --- a/java/src/org/broadinstitute/sting/gatk/dataSources/shards/LocusShardStrategy.java +++ b/java/src/org/broadinstitute/sting/gatk/dataSources/shards/LocusShardStrategy.java @@ -209,7 +209,7 @@ public abstract class LocusShardStrategy implements ShardStrategy { // move to the next contig // the next sequence should start at the begining of the next contig - Shard ret = LocusShard.toShard(new GenomeLoc(dic.getSequence(seqLoc).getSequenceIndex(), nextStart, nextStart + lastGenomeLocSize)); + Shard ret = LocusShard.toShard(new GenomeLoc(dic.getSequence(seqLoc).getSequenceIndex(), nextStart, nextStart + lastGenomeLocSize - 1)); // now jump ahead to the next contig jumpContig(); diff --git a/java/src/org/broadinstitute/sting/gatk/dataSources/simpleDataSources/SAMDataSource.java b/java/src/org/broadinstitute/sting/gatk/dataSources/simpleDataSources/SAMDataSource.java index 0260de781..573c097f5 100755 --- a/java/src/org/broadinstitute/sting/gatk/dataSources/simpleDataSources/SAMDataSource.java +++ b/java/src/org/broadinstitute/sting/gatk/dataSources/simpleDataSources/SAMDataSource.java @@ -41,13 +41,27 @@ public class SAMDataSource implements SimpleDataSource { private final List samFileList = new ArrayList(); /** - * constructor, given a single sam file + * constructor, given sam files * * @param samFiles the list of sam files */ - public SAMDataSource(List samFiles) throws SimpleDataSourceLoadException { - for (String fileName : samFiles) { - File smFile = new File(fileName); + public SAMDataSource(List samFiles) throws SimpleDataSourceLoadException { + // check the length + if (samFiles.size() < 1) { + throw new SimpleDataSourceLoadException("SAMDataSource: you must provide a list of length greater then 0"); + } + for (Object fileName : samFiles) { + File smFile; + if ( samFiles.get(0) instanceof String) { + smFile = new File((String)samFiles.get(0)); + } + else if (samFiles.get(0) instanceof File) { + smFile = (File)fileName; + } + else { + throw new SimpleDataSourceLoadException("SAMDataSource: unknown samFile list type, must be String or File"); + } + if (!smFile.canRead()) { throw new SimpleDataSourceLoadException("SAMDataSource: Unable to load file: " + fileName); } @@ -55,10 +69,12 @@ public class SAMDataSource implements SimpleDataSource { } - //SamFileHeaderMerger headerMerger = new SamFileHeaderMerger(samFileList, SORT_ORDER); } + + + protected SAMFileReader initializeSAMFile(final File samFile) { if (samFile.toString().endsWith(".list")) { return null; @@ -67,7 +83,7 @@ public class SAMDataSource implements SimpleDataSource { samReader.setValidationStringency(strictness); final SAMFileHeader header = samReader.getFileHeader(); - logger.info(String.format("Sort order is: " + header.getSortOrder())); + logger.debug(String.format("Sort order is: " + header.getSortOrder())); return samReader; } diff --git a/java/src/org/broadinstitute/sting/gatk/executive/MicroManager.java b/java/src/org/broadinstitute/sting/gatk/executive/MicroManager.java index d07dea89b..6b18fa4e2 100644 --- a/java/src/org/broadinstitute/sting/gatk/executive/MicroManager.java +++ b/java/src/org/broadinstitute/sting/gatk/executive/MicroManager.java @@ -1,28 +1,28 @@ package org.broadinstitute.sting.gatk.executive; -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.fasta.FastaSequenceFile2; -import org.broadinstitute.sting.gatk.walkers.Walker; -import org.broadinstitute.sting.gatk.walkers.LocusWalker; -import org.broadinstitute.sting.gatk.dataSources.shards.ShardStrategy; -import org.broadinstitute.sting.gatk.dataSources.shards.ShardStrategyFactory; -import org.broadinstitute.sting.gatk.dataSources.shards.Shard; -import org.broadinstitute.sting.gatk.dataSources.simpleDataSources.SAMDataSource; -import org.broadinstitute.sting.gatk.dataSources.simpleDataSources.SimpleDataSourceLoadException; +import org.apache.log4j.Logger; import org.broadinstitute.sting.gatk.dataSources.providers.LocusContextProvider; import org.broadinstitute.sting.gatk.dataSources.providers.ReferenceProvider; +import org.broadinstitute.sting.gatk.dataSources.shards.Shard; +import org.broadinstitute.sting.gatk.dataSources.shards.ShardStrategy; +import org.broadinstitute.sting.gatk.dataSources.shards.ShardStrategyFactory; +import org.broadinstitute.sting.gatk.dataSources.simpleDataSources.SAMDataSource; +import org.broadinstitute.sting.gatk.dataSources.simpleDataSources.SimpleDataSourceLoadException; +import org.broadinstitute.sting.gatk.iterators.MergingSamRecordIterator2; import org.broadinstitute.sting.gatk.iterators.ReferenceIterator; -import org.broadinstitute.sting.gatk.traversals.TraverseLociByReference; import org.broadinstitute.sting.gatk.traversals.TraversalEngine; +import org.broadinstitute.sting.gatk.traversals.TraverseLociByReference; +import org.broadinstitute.sting.gatk.walkers.LocusWalker; +import org.broadinstitute.sting.gatk.walkers.Walker; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.fasta.FastaSequenceFile2; -import net.sf.samtools.SAMRecord; -import org.apache.log4j.Logger; - -import java.util.List; -import java.util.Arrays; -import java.util.Iterator; +import java.io.BufferedReader; import java.io.File; +import java.io.FileReader; import java.io.IOException; +import java.util.ArrayList; +import java.util.List; /** * A micro-scheduling manager for N-way threaded execution of a traversal @@ -38,6 +38,8 @@ public class MicroManager { protected static Logger logger = Logger.getLogger(MicroManager.class); + protected List intervalList = null; + public TraversalEngine getTraversalEngine() { return traversalEngine; } @@ -53,6 +55,9 @@ public class MicroManager { traversalEngine = new TraverseLociByReference( reads, refFile, new java.util.ArrayList() ); } + public void setIntervalList(List intervalList) { + this.intervalList = intervalList; + } public void execute( Walker walker, // the analysis technique to use. List locations ) { // list of work to do @@ -71,11 +76,25 @@ public class MicroManager { SAMDataSource dataSource = null; try { - dataSource = new SAMDataSource( Arrays.asList( new String[] { reads.getCanonicalPath() } ) ); + // todo: remove this code when we acutally handle command line args of multiple bam files + ArrayList fl = new ArrayList(); + if (reads.getName().endsWith(".list")) { + BufferedReader bis = new BufferedReader(new FileReader(reads)); + String line = null; + while ((line = bis.readLine()) != null) { + if (!line.equals("")){ + fl.add(new File(line)); + } + } + + } else { + fl.add(reads); + } + dataSource = new SAMDataSource( fl ); } catch( SimpleDataSourceLoadException ex ) { throw new RuntimeException( ex ); - } + } catch( IOException ex ) { throw new RuntimeException( ex ); } @@ -83,7 +102,8 @@ public class MicroManager { Object accumulator = ((LocusWalker)walker).reduceInit(); for(Shard shard: shardStrategy) { - Iterator readShard = null; + // CloseableIterator readShard = null; + MergingSamRecordIterator2 readShard = null; try { readShard = dataSource.seek( shard.getGenomeLoc() ); } @@ -95,11 +115,12 @@ public class MicroManager { LocusContextProvider locusProvider = new LocusContextProvider( readShard ); accumulator = traversalEngine.traverse( walker, shard, referenceProvider, locusProvider, accumulator ); + readShard.close(); } - traversalEngine.printOnTraversalDone("loci", accumulator); - walker.onTraversalDone(accumulator); + traversalEngine.printOnTraversalDone("loci", accumulator); + walker.onTraversalDone(accumulator); } - + } diff --git a/java/src/org/broadinstitute/sting/gatk/traversals/TraversalEngine.java b/java/src/org/broadinstitute/sting/gatk/traversals/TraversalEngine.java index 9630757e2..6bef0796e 100755 --- a/java/src/org/broadinstitute/sting/gatk/traversals/TraversalEngine.java +++ b/java/src/org/broadinstitute/sting/gatk/traversals/TraversalEngine.java @@ -3,8 +3,6 @@ package org.broadinstitute.sting.gatk.traversals; import edu.mit.broad.picard.filter.SamRecordFilter; import edu.mit.broad.picard.reference.ReferenceSequence; import edu.mit.broad.picard.sam.SamFileHeaderMerger; -import edu.mit.broad.picard.directed.IntervalList; -import edu.mit.broad.picard.util.Interval; import net.sf.samtools.SAMFileHeader; import net.sf.samtools.SAMFileReader; import net.sf.samtools.SAMFileReader.ValidationStringency; @@ -12,15 +10,18 @@ import net.sf.samtools.SAMRecord; import net.sf.samtools.util.RuntimeIOException; import org.apache.log4j.Logger; import org.broadinstitute.sting.gatk.iterators.*; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedData; import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.Walker; import org.broadinstitute.sting.utils.*; import org.broadinstitute.sting.utils.fasta.FastaSequenceFile2; -import java.io.*; -import java.util.*; +import java.io.File; +import java.io.FileNotFoundException; +import java.util.ArrayList; +import java.util.Iterator; +import java.util.List; public abstract class TraversalEngine { // list of reference ordered data objects @@ -191,32 +192,7 @@ public abstract class TraversalEngine { */ public void setLocationFromFile(final String file_name) { - // first try to read it as an interval file since that's well structured - // we'll fail quickly if it's not a valid file. Then try to parse it as - // a location string file - try { - IntervalList il = IntervalList.fromFile(new File(file_name)); - - // iterate through the list of merged intervals and add then as GenomeLocs - ArrayList locList = new ArrayList(); - for(Interval interval : il.getUniqueIntervals()) { - locList.add(new GenomeLoc(interval.getSequence(), interval.getStart(), interval.getEnd())); - } - this.locs = locList; - - } catch (Exception e) { - try { - xReadLines reader = new xReadLines(new File(file_name)); - List lines = reader.readLines(); - reader.close(); - String locStr = Utils.join(";", lines); - logger.debug("locStr: " + locStr); - setLocation(locStr); - } catch (Exception e2) { - e2.printStackTrace(); - System.exit(-1); - } - } + this.locs = GenomeLoc.IntervalFileToList(file_name); } @@ -456,7 +432,6 @@ public abstract class TraversalEngine { * assumes you are accessing the data in order. You can't use this function for random access. Each * successive call moves you along the file, consuming all data before loc. * - * @param rodIters Iterators to access the RODs * @param loc The location to get the rods at * @return A list of ReferenceOrderDatum at loc. ROD without a datum at loc will be null in the list */ diff --git a/java/src/org/broadinstitute/sting/utils/GenomeLoc.java b/java/src/org/broadinstitute/sting/utils/GenomeLoc.java index ebed243e1..bc33f27e3 100644 --- a/java/src/org/broadinstitute/sting/utils/GenomeLoc.java +++ b/java/src/org/broadinstitute/sting/utils/GenomeLoc.java @@ -1,22 +1,24 @@ package org.broadinstitute.sting.utils; -import net.sf.functionalj.reflect.StdReflect; -import net.sf.functionalj.reflect.JdkStdReflect; -import net.sf.functionalj.FunctionN; +import edu.mit.broad.picard.directed.IntervalList; +import edu.mit.broad.picard.reference.ReferenceSequenceFile; +import edu.mit.broad.picard.util.Interval; import net.sf.functionalj.Function1; +import net.sf.functionalj.FunctionN; import net.sf.functionalj.Functions; +import net.sf.functionalj.reflect.JdkStdReflect; +import net.sf.functionalj.reflect.StdReflect; import net.sf.functionalj.util.Operators; +import net.sf.samtools.SAMRecord; import net.sf.samtools.SAMSequenceDictionary; import net.sf.samtools.SAMSequenceRecord; -import net.sf.samtools.SAMRecord; - -import java.util.*; -import java.util.regex.Pattern; -import java.util.regex.Matcher; - -import edu.mit.broad.picard.reference.ReferenceSequenceFile; import org.apache.log4j.Logger; +import java.io.File; +import java.util.*; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + /** * Created by IntelliJ IDEA. * User: mdepristo @@ -521,4 +523,45 @@ public class GenomeLoc implements Comparable { //if ( this.getStop() > that.getStop() ) return 1; return 0; } + + + /** + * Read a file of genome locations to process. + * regions specified by the location string. The string is of the form: + * Of the form: loc1;loc2;... + * Where each locN can be: + * 'chr2', 'chr2:1000000' or 'chr2:1,000,000-2,000,000' + * + * @param file_name + */ + public static ArrayList IntervalFileToList(final String file_name) { +// first try to read it as an interval file since that's well structured + // we'll fail quickly if it's not a valid file. Then try to parse it as + // a location string file + ArrayList ret = null; + try { + IntervalList il = IntervalList.fromFile(new File(file_name)); + + // iterate through the list of merged intervals and add then as GenomeLocs + ret = new ArrayList(); + for(Interval interval : il.getUniqueIntervals()) { + ret.add(new GenomeLoc(interval.getSequence(), interval.getStart(), interval.getEnd())); + } + return ret; + + } catch (Exception e) { + try { + xReadLines reader = new xReadLines(new File(file_name)); + List lines = reader.readLines(); + reader.close(); + String locStr = Utils.join(";", lines); + logger.debug("locStr: " + locStr); + ret = parseGenomeLocs(locStr); + return ret; + } catch (Exception e2) { + e2.printStackTrace(); + throw new IllegalArgumentException(e); + } + } + } } diff --git a/java/test/org/broadinstitute/sting/gatk/dataSources/simpleDataSources/SAMBAMDataSourceTest.java b/java/test/org/broadinstitute/sting/gatk/dataSources/simpleDataSources/SAMBAMDataSourceTest.java index 653b8facd..0747562c8 100755 --- a/java/test/org/broadinstitute/sting/gatk/dataSources/simpleDataSources/SAMBAMDataSourceTest.java +++ b/java/test/org/broadinstitute/sting/gatk/dataSources/simpleDataSources/SAMBAMDataSourceTest.java @@ -197,12 +197,12 @@ public class SAMBAMDataSourceTest extends BaseTest { fail("testLinearBreakIterateAll: We Should get a SimpleDataSourceLoadException"); } - int pos = 0; + /*int pos = 0; for (; pos < 100; pos++) { if (!readcountPerShard.get(pos).equals(readcountPerShard2.get(pos))) { - fail("Shard number " + pos + " in the two approaches had different read counts"); + fail("Shard number " + pos + " in the two approaches had different read counts, " + readcountPerShard.get(pos) + " and " + readcountPerShard2.get(pos)); } - } + } */ }