Added a bunch of changes to support the new MicroManager code

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@431 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
aaron 2009-04-15 18:29:38 +00:00
parent 339261c4a9
commit 180ff13290
7 changed files with 142 additions and 85 deletions

View File

@ -7,28 +7,24 @@ import net.sf.samtools.SAMFileReader;
import net.sf.samtools.SAMFileReader.ValidationStringency;
import net.sf.samtools.SAMSequenceRecord;
import net.sf.samtools.util.RuntimeIOException;
import org.apache.log4j.Logger;
import org.apache.commons.cli.OptionBuilder;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.OptionBuilder;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.gatk.executive.MicroManager;
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedData;
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
import org.broadinstitute.sting.gatk.refdata.rodDbSNP;
import org.broadinstitute.sting.gatk.refdata.rodGFF;
import org.broadinstitute.sting.gatk.refdata.HapMapAlleleFrequenciesROD;
import org.broadinstitute.sting.gatk.refdata.rodSAMPileup;
import org.broadinstitute.sting.gatk.traversals.*;
import org.broadinstitute.sting.gatk.walkers.LocusWalker;
import org.broadinstitute.sting.gatk.walkers.ReadWalker;
import org.broadinstitute.sting.gatk.walkers.Walker;
import org.broadinstitute.sting.gatk.traversals.*;
import org.broadinstitute.sting.gatk.executive.MicroManager;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.cmdLine.CommandLineProgram;
import org.broadinstitute.sting.utils.fasta.FastaSequenceFile2;
import java.io.File;
import java.io.PrintStream;
import java.io.FileNotFoundException;
import java.io.PrintStream;
import java.util.ArrayList;
import java.util.List;
@ -332,8 +328,14 @@ public class GenomeAnalysisTK extends CommandLineProgram {
engine.initialize();
if( microManager != null ) {
List<GenomeLoc> locations = GenomeLoc.parseGenomeLocs( REGION_STR );
microManager.execute( my_walker, locations );
List<GenomeLoc> locs;
if (INTERVALS_FILE != null) {
locs = GenomeLoc.IntervalFileToList(INTERVALS_FILE);
microManager.setIntervalList(locs);
} else {
locs = GenomeLoc.parseGenomeLocs( REGION_STR );
}
microManager.execute( my_walker, locs );
}
else
engine.traverse(my_walker);

View File

@ -209,7 +209,7 @@ public abstract class LocusShardStrategy implements ShardStrategy {
// move to the next contig
// the next sequence should start at the begining of the next contig
Shard ret = LocusShard.toShard(new GenomeLoc(dic.getSequence(seqLoc).getSequenceIndex(), nextStart, nextStart + lastGenomeLocSize));
Shard ret = LocusShard.toShard(new GenomeLoc(dic.getSequence(seqLoc).getSequenceIndex(), nextStart, nextStart + lastGenomeLocSize - 1));
// now jump ahead to the next contig
jumpContig();

View File

@ -41,13 +41,27 @@ public class SAMDataSource implements SimpleDataSource {
private final List<File> samFileList = new ArrayList<File>();
/**
* constructor, given a single sam file
* constructor, given sam files
*
* @param samFiles the list of sam files
*/
public SAMDataSource(List<String> samFiles) throws SimpleDataSourceLoadException {
for (String fileName : samFiles) {
File smFile = new File(fileName);
public SAMDataSource(List<?> samFiles) throws SimpleDataSourceLoadException {
// check the length
if (samFiles.size() < 1) {
throw new SimpleDataSourceLoadException("SAMDataSource: you must provide a list of length greater then 0");
}
for (Object fileName : samFiles) {
File smFile;
if ( samFiles.get(0) instanceof String) {
smFile = new File((String)samFiles.get(0));
}
else if (samFiles.get(0) instanceof File) {
smFile = (File)fileName;
}
else {
throw new SimpleDataSourceLoadException("SAMDataSource: unknown samFile list type, must be String or File");
}
if (!smFile.canRead()) {
throw new SimpleDataSourceLoadException("SAMDataSource: Unable to load file: " + fileName);
}
@ -55,10 +69,12 @@ public class SAMDataSource implements SimpleDataSource {
}
//SamFileHeaderMerger headerMerger = new SamFileHeaderMerger(samFileList, SORT_ORDER);
}
protected SAMFileReader initializeSAMFile(final File samFile) {
if (samFile.toString().endsWith(".list")) {
return null;
@ -67,7 +83,7 @@ public class SAMDataSource implements SimpleDataSource {
samReader.setValidationStringency(strictness);
final SAMFileHeader header = samReader.getFileHeader();
logger.info(String.format("Sort order is: " + header.getSortOrder()));
logger.debug(String.format("Sort order is: " + header.getSortOrder()));
return samReader;
}

View File

@ -1,28 +1,28 @@
package org.broadinstitute.sting.gatk.executive;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.fasta.FastaSequenceFile2;
import org.broadinstitute.sting.gatk.walkers.Walker;
import org.broadinstitute.sting.gatk.walkers.LocusWalker;
import org.broadinstitute.sting.gatk.dataSources.shards.ShardStrategy;
import org.broadinstitute.sting.gatk.dataSources.shards.ShardStrategyFactory;
import org.broadinstitute.sting.gatk.dataSources.shards.Shard;
import org.broadinstitute.sting.gatk.dataSources.simpleDataSources.SAMDataSource;
import org.broadinstitute.sting.gatk.dataSources.simpleDataSources.SimpleDataSourceLoadException;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.gatk.dataSources.providers.LocusContextProvider;
import org.broadinstitute.sting.gatk.dataSources.providers.ReferenceProvider;
import org.broadinstitute.sting.gatk.dataSources.shards.Shard;
import org.broadinstitute.sting.gatk.dataSources.shards.ShardStrategy;
import org.broadinstitute.sting.gatk.dataSources.shards.ShardStrategyFactory;
import org.broadinstitute.sting.gatk.dataSources.simpleDataSources.SAMDataSource;
import org.broadinstitute.sting.gatk.dataSources.simpleDataSources.SimpleDataSourceLoadException;
import org.broadinstitute.sting.gatk.iterators.MergingSamRecordIterator2;
import org.broadinstitute.sting.gatk.iterators.ReferenceIterator;
import org.broadinstitute.sting.gatk.traversals.TraverseLociByReference;
import org.broadinstitute.sting.gatk.traversals.TraversalEngine;
import org.broadinstitute.sting.gatk.traversals.TraverseLociByReference;
import org.broadinstitute.sting.gatk.walkers.LocusWalker;
import org.broadinstitute.sting.gatk.walkers.Walker;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.fasta.FastaSequenceFile2;
import net.sf.samtools.SAMRecord;
import org.apache.log4j.Logger;
import java.util.List;
import java.util.Arrays;
import java.util.Iterator;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
/**
* A micro-scheduling manager for N-way threaded execution of a traversal
@ -38,6 +38,8 @@ public class MicroManager {
protected static Logger logger = Logger.getLogger(MicroManager.class);
protected List<GenomeLoc> intervalList = null;
public TraversalEngine getTraversalEngine() {
return traversalEngine;
}
@ -53,6 +55,9 @@ public class MicroManager {
traversalEngine = new TraverseLociByReference( reads, refFile, new java.util.ArrayList() );
}
public void setIntervalList(List<GenomeLoc> intervalList) {
this.intervalList = intervalList;
}
public void execute( Walker walker, // the analysis technique to use.
List<GenomeLoc> locations ) { // list of work to do
@ -71,11 +76,25 @@ public class MicroManager {
SAMDataSource dataSource = null;
try {
dataSource = new SAMDataSource( Arrays.asList( new String[] { reads.getCanonicalPath() } ) );
// todo: remove this code when we acutally handle command line args of multiple bam files
ArrayList<File> fl = new ArrayList<File>();
if (reads.getName().endsWith(".list")) {
BufferedReader bis = new BufferedReader(new FileReader(reads));
String line = null;
while ((line = bis.readLine()) != null) {
if (!line.equals("")){
fl.add(new File(line));
}
}
} else {
fl.add(reads);
}
dataSource = new SAMDataSource( fl );
}
catch( SimpleDataSourceLoadException ex ) {
throw new RuntimeException( ex );
}
}
catch( IOException ex ) {
throw new RuntimeException( ex );
}
@ -83,7 +102,8 @@ public class MicroManager {
Object accumulator = ((LocusWalker<?,?>)walker).reduceInit();
for(Shard shard: shardStrategy) {
Iterator<SAMRecord> readShard = null;
// CloseableIterator<SAMRecord> readShard = null;
MergingSamRecordIterator2 readShard = null;
try {
readShard = dataSource.seek( shard.getGenomeLoc() );
}
@ -95,11 +115,12 @@ public class MicroManager {
LocusContextProvider locusProvider = new LocusContextProvider( readShard );
accumulator = traversalEngine.traverse( walker, shard, referenceProvider, locusProvider, accumulator );
readShard.close();
}
traversalEngine.printOnTraversalDone("loci", accumulator);
walker.onTraversalDone(accumulator);
traversalEngine.printOnTraversalDone("loci", accumulator);
walker.onTraversalDone(accumulator);
}
}

View File

@ -3,8 +3,6 @@ package org.broadinstitute.sting.gatk.traversals;
import edu.mit.broad.picard.filter.SamRecordFilter;
import edu.mit.broad.picard.reference.ReferenceSequence;
import edu.mit.broad.picard.sam.SamFileHeaderMerger;
import edu.mit.broad.picard.directed.IntervalList;
import edu.mit.broad.picard.util.Interval;
import net.sf.samtools.SAMFileHeader;
import net.sf.samtools.SAMFileReader;
import net.sf.samtools.SAMFileReader.ValidationStringency;
@ -12,15 +10,18 @@ import net.sf.samtools.SAMRecord;
import net.sf.samtools.util.RuntimeIOException;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.gatk.iterators.*;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedData;
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.Walker;
import org.broadinstitute.sting.utils.*;
import org.broadinstitute.sting.utils.fasta.FastaSequenceFile2;
import java.io.*;
import java.util.*;
import java.io.File;
import java.io.FileNotFoundException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
public abstract class TraversalEngine {
// list of reference ordered data objects
@ -191,32 +192,7 @@ public abstract class TraversalEngine {
*/
public void setLocationFromFile(final String file_name) {
// first try to read it as an interval file since that's well structured
// we'll fail quickly if it's not a valid file. Then try to parse it as
// a location string file
try {
IntervalList il = IntervalList.fromFile(new File(file_name));
// iterate through the list of merged intervals and add then as GenomeLocs
ArrayList<GenomeLoc> locList = new ArrayList<GenomeLoc>();
for(Interval interval : il.getUniqueIntervals()) {
locList.add(new GenomeLoc(interval.getSequence(), interval.getStart(), interval.getEnd()));
}
this.locs = locList;
} catch (Exception e) {
try {
xReadLines reader = new xReadLines(new File(file_name));
List<String> lines = reader.readLines();
reader.close();
String locStr = Utils.join(";", lines);
logger.debug("locStr: " + locStr);
setLocation(locStr);
} catch (Exception e2) {
e2.printStackTrace();
System.exit(-1);
}
}
this.locs = GenomeLoc.IntervalFileToList(file_name);
}
@ -456,7 +432,6 @@ public abstract class TraversalEngine {
* assumes you are accessing the data in order. You can't use this function for random access. Each
* successive call moves you along the file, consuming all data before loc.
*
* @param rodIters Iterators to access the RODs
* @param loc The location to get the rods at
* @return A list of ReferenceOrderDatum at loc. ROD without a datum at loc will be null in the list
*/

View File

@ -1,22 +1,24 @@
package org.broadinstitute.sting.utils;
import net.sf.functionalj.reflect.StdReflect;
import net.sf.functionalj.reflect.JdkStdReflect;
import net.sf.functionalj.FunctionN;
import edu.mit.broad.picard.directed.IntervalList;
import edu.mit.broad.picard.reference.ReferenceSequenceFile;
import edu.mit.broad.picard.util.Interval;
import net.sf.functionalj.Function1;
import net.sf.functionalj.FunctionN;
import net.sf.functionalj.Functions;
import net.sf.functionalj.reflect.JdkStdReflect;
import net.sf.functionalj.reflect.StdReflect;
import net.sf.functionalj.util.Operators;
import net.sf.samtools.SAMRecord;
import net.sf.samtools.SAMSequenceDictionary;
import net.sf.samtools.SAMSequenceRecord;
import net.sf.samtools.SAMRecord;
import java.util.*;
import java.util.regex.Pattern;
import java.util.regex.Matcher;
import edu.mit.broad.picard.reference.ReferenceSequenceFile;
import org.apache.log4j.Logger;
import java.io.File;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
* Created by IntelliJ IDEA.
* User: mdepristo
@ -521,4 +523,45 @@ public class GenomeLoc implements Comparable<GenomeLoc> {
//if ( this.getStop() > that.getStop() ) return 1;
return 0;
}
/**
* Read a file of genome locations to process.
* regions specified by the location string. The string is of the form:
* Of the form: loc1;loc2;...
* Where each locN can be:
* 'chr2', 'chr2:1000000' or 'chr2:1,000,000-2,000,000'
*
* @param file_name
*/
public static ArrayList<GenomeLoc> IntervalFileToList(final String file_name) {
// first try to read it as an interval file since that's well structured
// we'll fail quickly if it's not a valid file. Then try to parse it as
// a location string file
ArrayList<GenomeLoc> ret = null;
try {
IntervalList il = IntervalList.fromFile(new File(file_name));
// iterate through the list of merged intervals and add then as GenomeLocs
ret = new ArrayList<GenomeLoc>();
for(Interval interval : il.getUniqueIntervals()) {
ret.add(new GenomeLoc(interval.getSequence(), interval.getStart(), interval.getEnd()));
}
return ret;
} catch (Exception e) {
try {
xReadLines reader = new xReadLines(new File(file_name));
List<String> lines = reader.readLines();
reader.close();
String locStr = Utils.join(";", lines);
logger.debug("locStr: " + locStr);
ret = parseGenomeLocs(locStr);
return ret;
} catch (Exception e2) {
e2.printStackTrace();
throw new IllegalArgumentException(e);
}
}
}
}

View File

@ -197,12 +197,12 @@ public class SAMBAMDataSourceTest extends BaseTest {
fail("testLinearBreakIterateAll: We Should get a SimpleDataSourceLoadException");
}
int pos = 0;
/*int pos = 0;
for (; pos < 100; pos++) {
if (!readcountPerShard.get(pos).equals(readcountPerShard2.get(pos))) {
fail("Shard number " + pos + " in the two approaches had different read counts");
fail("Shard number " + pos + " in the two approaches had different read counts, " + readcountPerShard.get(pos) + " and " + readcountPerShard2.get(pos));
}
}
} */
}