2009-03-16 06:21:48 +08:00
package org.broadinstitute.sting.gatk ;
2009-02-27 05:50:29 +08:00
2009-03-23 05:06:22 +08:00
import edu.mit.broad.picard.reference.ReferenceSequence ;
import edu.mit.broad.picard.reference.ReferenceSequenceFile ;
import edu.mit.broad.picard.reference.ReferenceSequenceFileFactory ;
2009-03-11 03:34:00 +08:00
import net.sf.samtools.SAMFileReader.ValidationStringency ;
2009-03-22 20:04:11 +08:00
import net.sf.samtools.SAMSequenceRecord ;
2009-03-23 03:57:52 +08:00
import net.sf.samtools.util.RuntimeIOException ;
2009-03-23 05:06:22 +08:00
import org.apache.log4j.Logger ;
2009-03-16 06:21:48 +08:00
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedData ;
import org.broadinstitute.sting.gatk.refdata.rodDbSNP ;
import org.broadinstitute.sting.gatk.refdata.rodGFF ;
2009-03-23 05:06:22 +08:00
import org.broadinstitute.sting.gatk.walkers.LocusWalker ;
import org.broadinstitute.sting.gatk.walkers.ReadWalker ;
2009-03-25 08:12:00 +08:00
import org.broadinstitute.sting.gatk.walkers.Walker ;
2009-03-23 03:57:52 +08:00
import org.broadinstitute.sting.utils.FastaSequenceFile2 ;
2009-03-23 05:06:22 +08:00
import org.broadinstitute.sting.utils.GenomeLoc ;
import org.broadinstitute.sting.utils.Utils ;
import org.broadinstitute.sting.utils.cmdLine.CommandLineProgram ;
2009-02-27 05:50:29 +08:00
2009-03-23 05:06:22 +08:00
import java.io.File ;
2009-03-23 03:57:52 +08:00
import java.util.ArrayList ;
2009-03-23 05:06:22 +08:00
import java.util.HashMap ;
import java.util.List ;
2009-02-27 05:50:29 +08:00
2009-03-16 06:21:48 +08:00
public class GenomeAnalysisTK extends CommandLineProgram {
2009-03-20 06:12:25 +08:00
2009-03-23 05:06:22 +08:00
// parameters and their defaults
public File INPUT_FILE ;
public String MAX_READS_ARG = "-1" ;
public String STRICTNESS_ARG = "strict" ;
public File REF_FILE_ARG = null ;
public String DEBUGGING_STR = null ;
public String REGION_STR = null ;
public String Analysis_Name = null ;
public String DBSNP_FILE = null ;
2009-03-24 11:58:03 +08:00
public String HAPMAP_FILE = null ;
2009-03-23 05:06:22 +08:00
public Boolean ENABLED_THREADED_IO = false ;
public Boolean UNSAFE = false ;
public Boolean ENABLED_SORT_ON_FLY = false ;
public String INTERVALS_FILE = null ;
// our walker manager
2009-03-20 06:12:25 +08:00
private WalkerManager walkerManager = null ;
2009-03-25 08:12:00 +08:00
private Walker my_walker = null ;
2009-03-23 05:06:22 +08:00
public String pluginPathName = null ;
2009-02-27 05:50:29 +08:00
private TraversalEngine engine = null ;
public boolean DEBUGGING = false ;
2009-03-23 05:06:22 +08:00
/ * *
* our log , which we want to capture anything from this class
* /
private static Logger logger = Logger . getLogger ( GenomeAnalysisTK . class ) ;
/ * *
* setup our arguments , both required and optional
* < p / >
* Flags don ' t take an argument , the associated Boolean gets set to true if the flag appears on the command line .
* /
protected void setupArgs ( ) {
2009-03-25 08:12:00 +08:00
m_parser . addRequiredArg ( "input_file" , "I" , "SAM or BAM file for validation" , "INPUT_FILE" ) ;
2009-03-23 05:06:22 +08:00
m_parser . addOptionalArg ( "maximum_reads" , "M" , "Maximum number of reads to process before exiting" , "MAX_READS_ARG" ) ;
m_parser . addOptionalArg ( "validation_strictness" , "S" , "How strict should we be with validation" , "STRICTNESS_ARG" ) ;
m_parser . addOptionalArg ( "reference_sequence" , "R" , "Reference sequence file" , "REF_FILE_ARG" ) ;
m_parser . addOptionalArg ( "genome_region" , "L" , "Genome region to operation on: from chr:start-end" , "REGION_STR" ) ;
2009-03-25 08:12:00 +08:00
m_parser . addRequiredArg ( "analysis_type" , "T" , "Type of analysis to run" , "Analysis_Name" ) ;
2009-03-23 05:06:22 +08:00
m_parser . addOptionalArg ( "DBSNP" , "D" , "DBSNP file" , "DBSNP_FILE" ) ;
2009-03-24 11:58:03 +08:00
m_parser . addOptionalArg ( "Hapmap" , "H" , "Hapmap file" , "HAPMAP_FILE" ) ;
2009-03-23 23:01:32 +08:00
m_parser . addOptionalFlag ( "threaded_IO" , "P" , "If set, enables threaded I/O operations" , "ENABLED_THREADED_IO" ) ;
m_parser . addOptionalFlag ( "unsafe" , "U" , "If set, enables unsafe operations, nothing will be checked at runtime." , "UNSAFE" ) ;
m_parser . addOptionalFlag ( "sort_on_the_fly" , "F" , "If set, enables on fly sorting of reads file." , "ENABLED_SORT_ON_FLY" ) ;
2009-03-23 05:06:22 +08:00
m_parser . addOptionalArg ( "intervals_file" , "V" , "File containing list of genomic intervals to operate on. line := <contig> <start> <end>" , "INTERVALS_FILE" ) ;
}
2009-03-25 08:12:00 +08:00
/ * *
* GATK can add arguments dynamically based on analysis type .
* @return true
* /
@Override
protected boolean canAddArgumentsDynamically ( ) { return true ; }
/ * *
* GATK provides the walker as an argument source . As a side - effect , initializes the walker variable .
* @return List of walkers to load dynamically .
* /
@Override
protected Object [ ] getArgumentSources ( ) {
if ( Analysis_Name = = null )
throw new IllegalArgumentException ( "Must provide analysis name" ) ;
walkerManager = new WalkerManager ( pluginPathName ) ;
if ( ! walkerManager . doesWalkerExist ( Analysis_Name ) )
throw new IllegalArgumentException ( "Invalid analysis name" ) ;
my_walker = walkerManager . getWalkerByName ( Analysis_Name ) ;
return new Object [ ] { my_walker } ;
}
2009-03-23 05:06:22 +08:00
/ * *
* Required main method implementation .
* /
2009-02-27 05:50:29 +08:00
public static void main ( String [ ] argv ) {
2009-03-23 05:06:22 +08:00
start ( new GenomeAnalysisTK ( ) , argv ) ;
2009-02-27 05:50:29 +08:00
}
2009-03-23 05:06:22 +08:00
protected int execute ( ) {
2009-03-01 04:47:48 +08:00
final boolean TEST_ROD = false ;
2009-03-24 11:58:03 +08:00
List < ReferenceOrderedData > rods = new ArrayList < ReferenceOrderedData > ( ) ;
2009-02-28 01:07:57 +08:00
2009-03-23 05:06:22 +08:00
if ( TEST_ROD ) {
ReferenceOrderedData gff = new ReferenceOrderedData ( new File ( "trunk/data/gFFTest.gff" ) , rodGFF . class ) ;
2009-03-01 04:47:48 +08:00
gff . testMe ( ) ;
//ReferenceOrderedData dbsnp = new ReferenceOrderedData(new File("trunk/data/dbSNP_head.txt"), rodDbSNP.class );
2009-03-23 05:06:22 +08:00
ReferenceOrderedData dbsnp = new ReferenceOrderedData ( new File ( "/Volumes/Users/mdepristo/broad/ATK/exampleSAMs/dbSNP_chr20.txt" ) , rodDbSNP . class ) ;
2009-03-01 04:47:48 +08:00
//dbsnp.testMe();
2009-03-24 11:58:03 +08:00
rods . add ( dbsnp ) ; // { gff, dbsnp };
2009-03-23 05:06:22 +08:00
} else if ( DBSNP_FILE ! = null ) {
ReferenceOrderedData dbsnp = new ReferenceOrderedData ( new File ( DBSNP_FILE ) , rodDbSNP . class ) ;
2009-03-02 02:27:32 +08:00
//dbsnp.testMe();
2009-03-24 11:58:03 +08:00
rods . add ( dbsnp ) ; // { gff, dbsnp };
}
if ( HAPMAP_FILE ! = null ) {
ReferenceOrderedData gff = new ReferenceOrderedData ( new File ( HAPMAP_FILE ) , rodGFF . class ) ;
rods . add ( gff ) ;
2009-03-01 04:47:48 +08:00
}
this . engine = new TraversalEngine ( INPUT_FILE , REF_FILE_ARG , rods ) ;
2009-03-14 00:00:23 +08:00
2009-03-22 20:04:11 +08:00
// Prepare the sort ordering w.r.t. the sequence dictionary
2009-03-23 05:06:22 +08:00
if ( REF_FILE_ARG ! = null ) {
final ReferenceSequenceFile refFile = ReferenceSequenceFileFactory . getReferenceSequenceFile ( REF_FILE_ARG ) ;
List < SAMSequenceRecord > refContigs = refFile . getSequenceDictionary ( ) . getSequences ( ) ;
HashMap < String , Integer > refContigOrdering = new HashMap < String , Integer > ( ) ;
int i = 0 ;
for ( SAMSequenceRecord contig : refContigs ) {
refContigOrdering . put ( contig . getSequenceName ( ) , i ) ;
i + + ;
}
2009-03-22 20:04:11 +08:00
2009-03-23 05:06:22 +08:00
GenomeLoc . setContigOrdering ( refContigOrdering ) ;
}
2009-02-27 06:15:41 +08:00
ValidationStringency strictness ;
2009-03-23 05:06:22 +08:00
if ( STRICTNESS_ARG = = null ) {
2009-02-27 05:50:29 +08:00
strictness = ValidationStringency . STRICT ;
2009-03-23 05:06:22 +08:00
} else if ( STRICTNESS_ARG . toLowerCase ( ) . equals ( "lenient" ) ) {
strictness = ValidationStringency . LENIENT ;
} else if ( STRICTNESS_ARG . toLowerCase ( ) . equals ( "silent" ) ) {
strictness = ValidationStringency . SILENT ;
} else {
2009-02-27 05:50:29 +08:00
strictness = ValidationStringency . STRICT ;
2009-03-23 05:06:22 +08:00
}
logger . info ( "Strictness is " + strictness ) ;
2009-02-27 05:50:29 +08:00
engine . setStrictness ( strictness ) ;
2009-03-23 05:06:22 +08:00
engine . setDebugging ( ! ( DEBUGGING_STR = = null | | DEBUGGING_STR . toLowerCase ( ) . equals ( "true" ) ) ) ;
2009-02-27 05:50:29 +08:00
engine . setMaxReads ( Integer . parseInt ( MAX_READS_ARG ) ) ;
2009-03-23 05:06:22 +08:00
if ( REGION_STR ! = null ) {
2009-03-03 02:18:48 +08:00
engine . setLocation ( REGION_STR ) ;
}
2009-03-23 05:06:22 +08:00
if ( INTERVALS_FILE ! = null ) {
2009-03-22 00:07:32 +08:00
engine . setLocationFromFile ( INTERVALS_FILE ) ;
}
2009-03-23 05:06:22 +08:00
engine . setSafetyChecking ( ! UNSAFE ) ;
engine . setSortOnFly ( ENABLED_SORT_ON_FLY ) ;
2009-03-25 06:32:45 +08:00
engine . setThreadedIO ( ENABLED_THREADED_IO ) ;
engine . initialize ( ) ;
2009-03-16 06:21:48 +08:00
//engine.testReference();
2009-03-23 03:57:52 +08:00
2009-02-27 06:15:41 +08:00
//LocusWalker<Integer,Integer> walker = new PileupWalker();
2009-03-04 08:15:35 +08:00
2009-03-25 08:12:00 +08:00
if ( my_walker = = null )
throw new RuntimeException ( "Sanity check failed -- no walker present." ) ;
2009-03-04 08:15:35 +08:00
2009-03-25 08:12:00 +08:00
// Try to get the walker specified
2009-02-27 06:15:41 +08:00
try {
2009-03-23 05:06:22 +08:00
LocusWalker < ? , ? > walker = ( LocusWalker < ? , ? > ) my_walker ;
2009-03-26 00:01:58 +08:00
engine . traverseByLoci ( walker ) ;
2009-02-27 06:15:41 +08:00
}
2009-03-23 05:06:22 +08:00
catch ( java . lang . ClassCastException e ) {
2009-02-27 06:15:41 +08:00
// I guess we're a read walker LOL
2009-03-23 05:06:22 +08:00
ReadWalker < ? , ? > walker = ( ReadWalker < ? , ? > ) my_walker ;
2009-02-27 06:15:41 +08:00
engine . traverseByRead ( walker ) ;
}
2009-02-27 05:50:29 +08:00
return 0 ;
}
2009-03-23 03:57:52 +08:00
/ * *
* An inappropriately placed validation and performance testing routine for jumping
* around in the fasta sequence file .
* @param refFileName
* /
private static void testNewReferenceFeatures ( final File refFileName ) {
final FastaSequenceFile2 refFile = new FastaSequenceFile2 ( refFileName ) ;
Utils . setupRefContigOrdering ( refFile ) ;
List < SAMSequenceRecord > refContigs = refFile . getSequenceDictionary ( ) . getSequences ( ) ;
/ *
for ( SAMSequenceRecord refContig : refContigs ) {
System . out . printf ( " Traversing from chr1 to %s would require jumping %d bytes%n" ,
refContig . getSequenceName ( ) , refFile . getDistanceBetweenContigs ( "chr1" , refContig . getSequenceName ( ) ) ) ;
}
* /
String lastContig = null ;
List < Double > timings = new ArrayList < Double > ( ) ;
for ( SAMSequenceRecord startContig : refFile . getSequenceDictionary ( ) . getSequences ( ) ) {
final String startContigName = startContig . getSequenceName ( ) ;
for ( SAMSequenceRecord targetContig : refFile . getSequenceDictionary ( ) . getSequences ( ) ) {
refFile . seekToContig ( startContigName , true ) ;
System . out . printf ( "Seeking: current=%s, target=%s%n" , startContigName , targetContig . getSequenceName ( ) ) ;
long lastTime = System . currentTimeMillis ( ) ;
final boolean success = refFile . seekToContig ( targetContig . getSequenceName ( ) , true ) ;
long curTime = System . currentTimeMillis ( ) ;
final double elapsed = ( curTime - lastTime ) / 1000.0 ;
timings . add ( elapsed ) ;
System . out . printf ( " -> Elapsed time %.2f, averaging %.2f sec / seek for %d seeks%n" ,
elapsed , Utils . averageDouble ( timings ) , timings . size ( ) ) ;
if ( ! success ) {
System . out . printf ( "Failured to seek to %s from %s%n" , targetContig . getSequenceName ( ) , lastContig ) ;
}
//System.exit(1);
}
}
System . exit ( 1 ) ;
// code for randomly sampling the seeks
// Random rnd = new Random();
// String lastContig = null;
// List<Double> timings = new ArrayList<Double>();
// final int N_SAMPLES = 1000;
// //try { refFile.seekToContig("chr3"); } catch ( IOException e ) {}
// for ( int i = 0; i < N_SAMPLES; i++ ) {
// final int nextIndex = rnd.nextInt(refContigs.size());
// String nextContig = refFile.getSequenceDictionary().getSequence(nextIndex).getSequenceName();
// //nextContig = "chr2";
// try {
// System.out.printf("Seeking: current=%s, target=%s%n", refFile.getContigName(), nextContig);
// long lastTime = System.currentTimeMillis();
// final boolean success = refFile.seekToContig(nextContig, true);
// long curTime = System.currentTimeMillis();
// final double elapsed = (curTime - lastTime) / 1000.0;
// timings.add(elapsed);
// System.out.printf(" -> Elapsed time %.2f, averaging %.2f sec / seek for %d seeks%n",
// elapsed, Utils.averageDouble(timings), timings.size());
//
// if ( ! success ) {
// System.out.printf("Failured to seek to %s from %s%n", nextContig, lastContig );
// }
// //System.exit(1);
// } catch ( IOException e ) {
// System.out.printf("Failured to seek to %s from %s%n", nextContig, lastContig );
// e.printStackTrace();
// }
//
// lastContig = nextContig;
// }
// System.exit(1);
/ *
final String targetChr = "chr10" ;
try {
refFile . seekToContig ( targetChr ) ;
} catch ( IOException e ) {
System . out . printf ( "Failured to seek to %s%n" , targetChr ) ;
2009-03-23 05:06:22 +08:00
e . printStackTrace ( ) ;
2009-03-23 03:57:52 +08:00
}
System . exit ( 1 ) ;
* /
//List<Double> timings = new ArrayList<Double>();
final long startTime = System . currentTimeMillis ( ) ;
long lastTime = System . currentTimeMillis ( ) ;
int i = 0 ;
String prevNextContigName = null ;
System . out . printf ( "Walking reference sequence:%n" ) ;
for ( SAMSequenceRecord refContig : refContigs ) {
long curTime = System . currentTimeMillis ( ) ;
ReferenceSequence contig = refFile . nextSequence ( ) ;
final double elapsed = ( curTime - lastTime ) / 1000.0 ;
timings . add ( elapsed ) ;
System . out . printf ( "%2d : expected %s contig, found %s with next of %s after %.2f seconds, average is %.2f%n" , i ,
refContig . getSequenceName ( ) , contig . getName ( ) , refFile . getNextContigName ( ) , elapsed , Utils . averageDouble ( timings ) ) ;
if ( prevNextContigName ! = null & & contig . getName ( ) ! = null & & ! prevNextContigName . equals ( contig . getName ( ) ) )
throw new RuntimeIOException ( String . format ( "Unexpected contig ordering %s was expected next, but I found %s?" ,
prevNextContigName , contig . getName ( ) ) ) ;
prevNextContigName = refFile . getNextContigName ( ) ;
lastTime = curTime ;
i + + ;
System . out . printf ( " Traversing from chr1 to %s would require jumping %d bytes%n" ,
contig . getName ( ) , refFile . getDistanceBetweenContigs ( "chr1" , contig . getName ( ) ) ) ;
}
}
2009-03-04 08:15:35 +08:00
}