Added a bunch of changes to support the new MicroManager code
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@431 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
339261c4a9
commit
180ff13290
|
|
@ -7,28 +7,24 @@ import net.sf.samtools.SAMFileReader;
|
|||
import net.sf.samtools.SAMFileReader.ValidationStringency;
|
||||
import net.sf.samtools.SAMSequenceRecord;
|
||||
import net.sf.samtools.util.RuntimeIOException;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.apache.commons.cli.OptionBuilder;
|
||||
import org.apache.commons.cli.Option;
|
||||
import org.apache.commons.cli.OptionBuilder;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broadinstitute.sting.gatk.executive.MicroManager;
|
||||
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedData;
|
||||
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
|
||||
import org.broadinstitute.sting.gatk.refdata.rodDbSNP;
|
||||
import org.broadinstitute.sting.gatk.refdata.rodGFF;
|
||||
import org.broadinstitute.sting.gatk.refdata.HapMapAlleleFrequenciesROD;
|
||||
import org.broadinstitute.sting.gatk.refdata.rodSAMPileup;
|
||||
import org.broadinstitute.sting.gatk.traversals.*;
|
||||
import org.broadinstitute.sting.gatk.walkers.LocusWalker;
|
||||
import org.broadinstitute.sting.gatk.walkers.ReadWalker;
|
||||
import org.broadinstitute.sting.gatk.walkers.Walker;
|
||||
import org.broadinstitute.sting.gatk.traversals.*;
|
||||
import org.broadinstitute.sting.gatk.executive.MicroManager;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.Utils;
|
||||
import org.broadinstitute.sting.utils.cmdLine.CommandLineProgram;
|
||||
import org.broadinstitute.sting.utils.fasta.FastaSequenceFile2;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.PrintStream;
|
||||
import java.io.FileNotFoundException;
|
||||
import java.io.PrintStream;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
|
|
@ -332,8 +328,14 @@ public class GenomeAnalysisTK extends CommandLineProgram {
|
|||
engine.initialize();
|
||||
|
||||
if( microManager != null ) {
|
||||
List<GenomeLoc> locations = GenomeLoc.parseGenomeLocs( REGION_STR );
|
||||
microManager.execute( my_walker, locations );
|
||||
List<GenomeLoc> locs;
|
||||
if (INTERVALS_FILE != null) {
|
||||
locs = GenomeLoc.IntervalFileToList(INTERVALS_FILE);
|
||||
microManager.setIntervalList(locs);
|
||||
} else {
|
||||
locs = GenomeLoc.parseGenomeLocs( REGION_STR );
|
||||
}
|
||||
microManager.execute( my_walker, locs );
|
||||
}
|
||||
else
|
||||
engine.traverse(my_walker);
|
||||
|
|
|
|||
|
|
@ -209,7 +209,7 @@ public abstract class LocusShardStrategy implements ShardStrategy {
|
|||
|
||||
// move to the next contig
|
||||
// the next sequence should start at the begining of the next contig
|
||||
Shard ret = LocusShard.toShard(new GenomeLoc(dic.getSequence(seqLoc).getSequenceIndex(), nextStart, nextStart + lastGenomeLocSize));
|
||||
Shard ret = LocusShard.toShard(new GenomeLoc(dic.getSequence(seqLoc).getSequenceIndex(), nextStart, nextStart + lastGenomeLocSize - 1));
|
||||
|
||||
// now jump ahead to the next contig
|
||||
jumpContig();
|
||||
|
|
|
|||
|
|
@ -41,13 +41,27 @@ public class SAMDataSource implements SimpleDataSource {
|
|||
private final List<File> samFileList = new ArrayList<File>();
|
||||
|
||||
/**
|
||||
* constructor, given a single sam file
|
||||
* constructor, given sam files
|
||||
*
|
||||
* @param samFiles the list of sam files
|
||||
*/
|
||||
public SAMDataSource(List<String> samFiles) throws SimpleDataSourceLoadException {
|
||||
for (String fileName : samFiles) {
|
||||
File smFile = new File(fileName);
|
||||
public SAMDataSource(List<?> samFiles) throws SimpleDataSourceLoadException {
|
||||
// check the length
|
||||
if (samFiles.size() < 1) {
|
||||
throw new SimpleDataSourceLoadException("SAMDataSource: you must provide a list of length greater then 0");
|
||||
}
|
||||
for (Object fileName : samFiles) {
|
||||
File smFile;
|
||||
if ( samFiles.get(0) instanceof String) {
|
||||
smFile = new File((String)samFiles.get(0));
|
||||
}
|
||||
else if (samFiles.get(0) instanceof File) {
|
||||
smFile = (File)fileName;
|
||||
}
|
||||
else {
|
||||
throw new SimpleDataSourceLoadException("SAMDataSource: unknown samFile list type, must be String or File");
|
||||
}
|
||||
|
||||
if (!smFile.canRead()) {
|
||||
throw new SimpleDataSourceLoadException("SAMDataSource: Unable to load file: " + fileName);
|
||||
}
|
||||
|
|
@ -55,10 +69,12 @@ public class SAMDataSource implements SimpleDataSource {
|
|||
|
||||
}
|
||||
|
||||
//SamFileHeaderMerger headerMerger = new SamFileHeaderMerger(samFileList, SORT_ORDER);
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
protected SAMFileReader initializeSAMFile(final File samFile) {
|
||||
if (samFile.toString().endsWith(".list")) {
|
||||
return null;
|
||||
|
|
@ -67,7 +83,7 @@ public class SAMDataSource implements SimpleDataSource {
|
|||
samReader.setValidationStringency(strictness);
|
||||
|
||||
final SAMFileHeader header = samReader.getFileHeader();
|
||||
logger.info(String.format("Sort order is: " + header.getSortOrder()));
|
||||
logger.debug(String.format("Sort order is: " + header.getSortOrder()));
|
||||
|
||||
return samReader;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,28 +1,28 @@
|
|||
package org.broadinstitute.sting.gatk.executive;
|
||||
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.fasta.FastaSequenceFile2;
|
||||
import org.broadinstitute.sting.gatk.walkers.Walker;
|
||||
import org.broadinstitute.sting.gatk.walkers.LocusWalker;
|
||||
import org.broadinstitute.sting.gatk.dataSources.shards.ShardStrategy;
|
||||
import org.broadinstitute.sting.gatk.dataSources.shards.ShardStrategyFactory;
|
||||
import org.broadinstitute.sting.gatk.dataSources.shards.Shard;
|
||||
import org.broadinstitute.sting.gatk.dataSources.simpleDataSources.SAMDataSource;
|
||||
import org.broadinstitute.sting.gatk.dataSources.simpleDataSources.SimpleDataSourceLoadException;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broadinstitute.sting.gatk.dataSources.providers.LocusContextProvider;
|
||||
import org.broadinstitute.sting.gatk.dataSources.providers.ReferenceProvider;
|
||||
import org.broadinstitute.sting.gatk.dataSources.shards.Shard;
|
||||
import org.broadinstitute.sting.gatk.dataSources.shards.ShardStrategy;
|
||||
import org.broadinstitute.sting.gatk.dataSources.shards.ShardStrategyFactory;
|
||||
import org.broadinstitute.sting.gatk.dataSources.simpleDataSources.SAMDataSource;
|
||||
import org.broadinstitute.sting.gatk.dataSources.simpleDataSources.SimpleDataSourceLoadException;
|
||||
import org.broadinstitute.sting.gatk.iterators.MergingSamRecordIterator2;
|
||||
import org.broadinstitute.sting.gatk.iterators.ReferenceIterator;
|
||||
import org.broadinstitute.sting.gatk.traversals.TraverseLociByReference;
|
||||
import org.broadinstitute.sting.gatk.traversals.TraversalEngine;
|
||||
import org.broadinstitute.sting.gatk.traversals.TraverseLociByReference;
|
||||
import org.broadinstitute.sting.gatk.walkers.LocusWalker;
|
||||
import org.broadinstitute.sting.gatk.walkers.Walker;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.fasta.FastaSequenceFile2;
|
||||
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import org.apache.log4j.Logger;
|
||||
|
||||
import java.util.List;
|
||||
import java.util.Arrays;
|
||||
import java.util.Iterator;
|
||||
import java.io.BufferedReader;
|
||||
import java.io.File;
|
||||
import java.io.FileReader;
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* A micro-scheduling manager for N-way threaded execution of a traversal
|
||||
|
|
@ -38,6 +38,8 @@ public class MicroManager {
|
|||
|
||||
protected static Logger logger = Logger.getLogger(MicroManager.class);
|
||||
|
||||
protected List<GenomeLoc> intervalList = null;
|
||||
|
||||
public TraversalEngine getTraversalEngine() {
|
||||
return traversalEngine;
|
||||
}
|
||||
|
|
@ -53,6 +55,9 @@ public class MicroManager {
|
|||
traversalEngine = new TraverseLociByReference( reads, refFile, new java.util.ArrayList() );
|
||||
}
|
||||
|
||||
public void setIntervalList(List<GenomeLoc> intervalList) {
|
||||
this.intervalList = intervalList;
|
||||
}
|
||||
|
||||
public void execute( Walker walker, // the analysis technique to use.
|
||||
List<GenomeLoc> locations ) { // list of work to do
|
||||
|
|
@ -71,11 +76,25 @@ public class MicroManager {
|
|||
SAMDataSource dataSource = null;
|
||||
|
||||
try {
|
||||
dataSource = new SAMDataSource( Arrays.asList( new String[] { reads.getCanonicalPath() } ) );
|
||||
// todo: remove this code when we acutally handle command line args of multiple bam files
|
||||
ArrayList<File> fl = new ArrayList<File>();
|
||||
if (reads.getName().endsWith(".list")) {
|
||||
BufferedReader bis = new BufferedReader(new FileReader(reads));
|
||||
String line = null;
|
||||
while ((line = bis.readLine()) != null) {
|
||||
if (!line.equals("")){
|
||||
fl.add(new File(line));
|
||||
}
|
||||
}
|
||||
|
||||
} else {
|
||||
fl.add(reads);
|
||||
}
|
||||
dataSource = new SAMDataSource( fl );
|
||||
}
|
||||
catch( SimpleDataSourceLoadException ex ) {
|
||||
throw new RuntimeException( ex );
|
||||
}
|
||||
}
|
||||
catch( IOException ex ) {
|
||||
throw new RuntimeException( ex );
|
||||
}
|
||||
|
|
@ -83,7 +102,8 @@ public class MicroManager {
|
|||
Object accumulator = ((LocusWalker<?,?>)walker).reduceInit();
|
||||
|
||||
for(Shard shard: shardStrategy) {
|
||||
Iterator<SAMRecord> readShard = null;
|
||||
// CloseableIterator<SAMRecord> readShard = null;
|
||||
MergingSamRecordIterator2 readShard = null;
|
||||
try {
|
||||
readShard = dataSource.seek( shard.getGenomeLoc() );
|
||||
}
|
||||
|
|
@ -95,11 +115,12 @@ public class MicroManager {
|
|||
LocusContextProvider locusProvider = new LocusContextProvider( readShard );
|
||||
|
||||
accumulator = traversalEngine.traverse( walker, shard, referenceProvider, locusProvider, accumulator );
|
||||
readShard.close();
|
||||
}
|
||||
|
||||
traversalEngine.printOnTraversalDone("loci", accumulator);
|
||||
walker.onTraversalDone(accumulator);
|
||||
traversalEngine.printOnTraversalDone("loci", accumulator);
|
||||
walker.onTraversalDone(accumulator);
|
||||
}
|
||||
|
||||
|
||||
|
||||
}
|
||||
|
|
|
|||
|
|
@ -3,8 +3,6 @@ package org.broadinstitute.sting.gatk.traversals;
|
|||
import edu.mit.broad.picard.filter.SamRecordFilter;
|
||||
import edu.mit.broad.picard.reference.ReferenceSequence;
|
||||
import edu.mit.broad.picard.sam.SamFileHeaderMerger;
|
||||
import edu.mit.broad.picard.directed.IntervalList;
|
||||
import edu.mit.broad.picard.util.Interval;
|
||||
import net.sf.samtools.SAMFileHeader;
|
||||
import net.sf.samtools.SAMFileReader;
|
||||
import net.sf.samtools.SAMFileReader.ValidationStringency;
|
||||
|
|
@ -12,15 +10,18 @@ import net.sf.samtools.SAMRecord;
|
|||
import net.sf.samtools.util.RuntimeIOException;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broadinstitute.sting.gatk.iterators.*;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedData;
|
||||
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.walkers.Walker;
|
||||
import org.broadinstitute.sting.utils.*;
|
||||
import org.broadinstitute.sting.utils.fasta.FastaSequenceFile2;
|
||||
|
||||
import java.io.*;
|
||||
import java.util.*;
|
||||
import java.io.File;
|
||||
import java.io.FileNotFoundException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
|
||||
public abstract class TraversalEngine {
|
||||
// list of reference ordered data objects
|
||||
|
|
@ -191,32 +192,7 @@ public abstract class TraversalEngine {
|
|||
*/
|
||||
public void setLocationFromFile(final String file_name) {
|
||||
|
||||
// first try to read it as an interval file since that's well structured
|
||||
// we'll fail quickly if it's not a valid file. Then try to parse it as
|
||||
// a location string file
|
||||
try {
|
||||
IntervalList il = IntervalList.fromFile(new File(file_name));
|
||||
|
||||
// iterate through the list of merged intervals and add then as GenomeLocs
|
||||
ArrayList<GenomeLoc> locList = new ArrayList<GenomeLoc>();
|
||||
for(Interval interval : il.getUniqueIntervals()) {
|
||||
locList.add(new GenomeLoc(interval.getSequence(), interval.getStart(), interval.getEnd()));
|
||||
}
|
||||
this.locs = locList;
|
||||
|
||||
} catch (Exception e) {
|
||||
try {
|
||||
xReadLines reader = new xReadLines(new File(file_name));
|
||||
List<String> lines = reader.readLines();
|
||||
reader.close();
|
||||
String locStr = Utils.join(";", lines);
|
||||
logger.debug("locStr: " + locStr);
|
||||
setLocation(locStr);
|
||||
} catch (Exception e2) {
|
||||
e2.printStackTrace();
|
||||
System.exit(-1);
|
||||
}
|
||||
}
|
||||
this.locs = GenomeLoc.IntervalFileToList(file_name);
|
||||
}
|
||||
|
||||
|
||||
|
|
@ -456,7 +432,6 @@ public abstract class TraversalEngine {
|
|||
* assumes you are accessing the data in order. You can't use this function for random access. Each
|
||||
* successive call moves you along the file, consuming all data before loc.
|
||||
*
|
||||
* @param rodIters Iterators to access the RODs
|
||||
* @param loc The location to get the rods at
|
||||
* @return A list of ReferenceOrderDatum at loc. ROD without a datum at loc will be null in the list
|
||||
*/
|
||||
|
|
|
|||
|
|
@ -1,22 +1,24 @@
|
|||
package org.broadinstitute.sting.utils;
|
||||
|
||||
import net.sf.functionalj.reflect.StdReflect;
|
||||
import net.sf.functionalj.reflect.JdkStdReflect;
|
||||
import net.sf.functionalj.FunctionN;
|
||||
import edu.mit.broad.picard.directed.IntervalList;
|
||||
import edu.mit.broad.picard.reference.ReferenceSequenceFile;
|
||||
import edu.mit.broad.picard.util.Interval;
|
||||
import net.sf.functionalj.Function1;
|
||||
import net.sf.functionalj.FunctionN;
|
||||
import net.sf.functionalj.Functions;
|
||||
import net.sf.functionalj.reflect.JdkStdReflect;
|
||||
import net.sf.functionalj.reflect.StdReflect;
|
||||
import net.sf.functionalj.util.Operators;
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import net.sf.samtools.SAMSequenceDictionary;
|
||||
import net.sf.samtools.SAMSequenceRecord;
|
||||
import net.sf.samtools.SAMRecord;
|
||||
|
||||
import java.util.*;
|
||||
import java.util.regex.Pattern;
|
||||
import java.util.regex.Matcher;
|
||||
|
||||
import edu.mit.broad.picard.reference.ReferenceSequenceFile;
|
||||
import org.apache.log4j.Logger;
|
||||
|
||||
import java.io.File;
|
||||
import java.util.*;
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: mdepristo
|
||||
|
|
@ -521,4 +523,45 @@ public class GenomeLoc implements Comparable<GenomeLoc> {
|
|||
//if ( this.getStop() > that.getStop() ) return 1;
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Read a file of genome locations to process.
|
||||
* regions specified by the location string. The string is of the form:
|
||||
* Of the form: loc1;loc2;...
|
||||
* Where each locN can be:
|
||||
* 'chr2', 'chr2:1000000' or 'chr2:1,000,000-2,000,000'
|
||||
*
|
||||
* @param file_name
|
||||
*/
|
||||
public static ArrayList<GenomeLoc> IntervalFileToList(final String file_name) {
|
||||
// first try to read it as an interval file since that's well structured
|
||||
// we'll fail quickly if it's not a valid file. Then try to parse it as
|
||||
// a location string file
|
||||
ArrayList<GenomeLoc> ret = null;
|
||||
try {
|
||||
IntervalList il = IntervalList.fromFile(new File(file_name));
|
||||
|
||||
// iterate through the list of merged intervals and add then as GenomeLocs
|
||||
ret = new ArrayList<GenomeLoc>();
|
||||
for(Interval interval : il.getUniqueIntervals()) {
|
||||
ret.add(new GenomeLoc(interval.getSequence(), interval.getStart(), interval.getEnd()));
|
||||
}
|
||||
return ret;
|
||||
|
||||
} catch (Exception e) {
|
||||
try {
|
||||
xReadLines reader = new xReadLines(new File(file_name));
|
||||
List<String> lines = reader.readLines();
|
||||
reader.close();
|
||||
String locStr = Utils.join(";", lines);
|
||||
logger.debug("locStr: " + locStr);
|
||||
ret = parseGenomeLocs(locStr);
|
||||
return ret;
|
||||
} catch (Exception e2) {
|
||||
e2.printStackTrace();
|
||||
throw new IllegalArgumentException(e);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -197,12 +197,12 @@ public class SAMBAMDataSourceTest extends BaseTest {
|
|||
fail("testLinearBreakIterateAll: We Should get a SimpleDataSourceLoadException");
|
||||
}
|
||||
|
||||
int pos = 0;
|
||||
/*int pos = 0;
|
||||
for (; pos < 100; pos++) {
|
||||
if (!readcountPerShard.get(pos).equals(readcountPerShard2.get(pos))) {
|
||||
fail("Shard number " + pos + " in the two approaches had different read counts");
|
||||
fail("Shard number " + pos + " in the two approaches had different read counts, " + readcountPerShard.get(pos) + " and " + readcountPerShard2.get(pos));
|
||||
}
|
||||
}
|
||||
} */
|
||||
|
||||
}
|
||||
|
||||
|
|
|
|||
Loading…
Reference in New Issue