First draft of support for a file containing a list of intervals.

Appears to work, but is inefficient:
at each reference location, the entire list of intervals is searched linearly.

Instead we need to have the intervals sorted, and simply seek forward from interval to interval.



git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@124 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
jmaguire 2009-03-21 16:07:32 +00:00
parent 1fcf4c0cbf
commit 0ea44a5805
2 changed files with 50 additions and 0 deletions

View File

@ -22,6 +22,9 @@ public class GenomeAnalysisTK extends CommandLineProgram {
@Option(shortName="R", doc="Reference sequence file", optional=true) public File REF_FILE_ARG = null;
@Option(shortName="B", doc="Debugging output", optional=true) public String DEBUGGING_STR = null;
@Option(shortName="L", doc="Genome region to operation on: from chr:start-end", optional=true) public String REGION_STR = null;
@Option(shortName="INT", doc="File containing list of genomic intervals to operate on. line := <contig> <start> <end>\n", optional=true) public String INTERVALS_FILE = null;
@Option(shortName="T", doc="Type of analysis to run") public String Analysis_Name = null;
@Option(shortName="DBSNP", doc="DBSNP file", optional=true) public String DBSNP_FILE = null;
@Option(shortName="THREADED_IO", doc="If true, enables threaded I/O operations", optional=true) public String ENABLED_THREADED_IO = "false";
@ -90,6 +93,11 @@ public class GenomeAnalysisTK extends CommandLineProgram {
engine.setLocation(REGION_STR);
}
if (INTERVALS_FILE != null)
{
engine.setLocationFromFile(INTERVALS_FILE);
}
engine.setSafetyChecking(! UNSAFE.toLowerCase().equals("true"));
engine.setSortOnFly(ENABLED_SORT_ON_FLY.toLowerCase().equals("true"));

View File

@ -144,6 +144,47 @@ public class TraversalEngine {
this.locs = parseGenomeLocs(locStr);
}
/**
 * Reads a file of genome locations to process and stores the parsed result in {@code locs}.
 *
 * Each line of the file is taken verbatim as one location. The lines are joined with ';'
 * into a single location string of the form loc1;loc2;... and that string is passed to
 * {@code parseGenomeLocs}. Each line is presumably expected to be in a form that
 * {@code parseGenomeLocs} accepts, e.g. 'chr2', 'chr2:1000000' or
 * 'chr2:1,000,000-2,000,000' (TODO confirm against parseGenomeLocs).
 *
 * If the file cannot be opened or read, the stack trace is printed and the JVM exits
 * with status -1.
 *
 * @param file_name path of the file holding one genome location per line
 */
public void setLocationFromFile( final String file_name )
{
    // StringBuilder avoids re-copying the whole accumulated string for every line.
    final StringBuilder locStr = new StringBuilder();
    Scanner scanner = null;
    try
    {
        scanner = new Scanner(new File(file_name));
        while ( scanner.hasNextLine() )
        {
            // nextLine() already drops the line terminator, so no newline stripping is needed.
            locStr.append(scanner.nextLine());
            if (scanner.hasNextLine()) { locStr.append(";"); }
        }
    }
    catch (Exception e)
    {
        e.printStackTrace();
        System.exit(-1);
    }
    finally
    {
        // scanner is still null when the constructor itself failed; guard against an NPE
        // that would otherwise mask the original problem.
        if (scanner != null) { scanner.close(); }
    }
    final String joinedLocs = locStr.toString();
    System.out.format("DEBUG: locStr: %s\n", joinedLocs);
    this.locs = parseGenomeLocs(joinedLocs);
}
/**
* Useful utility function that parses a location string into a coordinate-order sorted
* array of GenomeLoc objects
@ -503,6 +544,7 @@ public class TraversalEngine {
final LocusContext locus = iter.next();
// Poor man's version of index LOL
// HALP! I HAZ 10K INTERVALS 2 INDX
GenomeLoc curLoc = locus.getLocation();
if ( inLocations(curLoc) ) {
if ( prevLoc != null && curLoc.compareContigs(prevLoc) != 0 )