2009-05-13 04:24:18 +08:00
package org.broadinstitute.sting.secondarybase ;
2009-05-13 03:46:34 +08:00
2009-05-19 01:42:08 +08:00
import net.sf.samtools.* ;
2009-05-20 08:07:24 +08:00
import net.sf.samtools.util.CloseableIterator ;
2009-05-22 04:36:12 +08:00
import org.broadinstitute.sting.utils.* ;
2009-05-15 02:58:43 +08:00
import org.broadinstitute.sting.utils.cmdLine.Argument ;
import org.broadinstitute.sting.utils.cmdLine.CommandLineProgram ;
import java.io.File ;
2009-05-20 08:07:24 +08:00
import java.io.IOException ;
import java.util.ArrayList ;
import java.util.regex.Matcher ;
2009-05-22 03:40:47 +08:00
import java.util.regex.Pattern ;
2009-05-13 03:46:34 +08:00
2009-05-16 05:22:24 +08:00
/ * *
* AnnotateSecondaryBase computes the second best base for every base in an Illumina lane .
* First , a statistical model is fit to a subset of the raw Illumina intensities ( i . e . those
* generated by Illumina ' s "Firecrest" package ) . Then , every read ' s set of raw intensities
* is evaluated against this model to determine the base probability distribution of a given
* base observation .
*
* Approximately 95 % of the time , this method and Illumina ' s basecalling package , "Bustard" ,
2009-05-22 03:40:47 +08:00
* agree on the identity of the best base . In these cases , we simply annotate our estimate
* of the second - best base . In cases where this method and Bustard disagree , we annotate
* the secondary base as this method ' s primary base .
2009-05-16 05:22:24 +08:00
*
* @author Kiran Garimella
* /
2009-05-22 03:40:47 +08:00
/ *
An example invocation :
java - Xmx2048m - Djava . io . tmpdir = / broad / hptmp / - jar / path / to / AnnotateSecondaryBase . jar \
- D / seq / solexaproc2 / SL - XAX / analyzed / 0 90217_ SL - XAX_0003_FC30R47AAXX / Data / C1 - 152_ Firecrest1 .3 .2_25 - 02 - 2009_ prodinfo / Bustard1 .3 .2_25 - 02 - 2009_ prodinfo / \
- L 5 \
- B 30 R47AAXX090217
- CR ' 0 - 75 , 76 - 151 ' \
- SO ~ / test . sam \
- SI / seq / picard / 30 R47AAXX / C1 - 152_2009 - 02 - 17_2009 - 04 - 02 / 5 / Solexa - 10265 / 30 R47AAXX .5 . aligned . bam \
* /
2009-05-13 03:46:34 +08:00
public class AnnotateSecondaryBase extends CommandLineProgram {
public static AnnotateSecondaryBase Instance = null ;
2009-05-22 03:40:47 +08:00
@Argument ( fullName = "bustard_dir" , shortName = "D" , doc = "Illumina Bustard directory" ) public File BUSTARD_DIR ;
2009-05-13 03:46:34 +08:00
@Argument ( fullName = "lane" , shortName = "L" , doc = "Illumina flowcell lane" ) public int LANE ;
2009-05-22 03:40:47 +08:00
@Argument ( fullName = "run_barcode" , shortName = "B" , doc = "Run barcode (embedded as part of the read name; i.e. 30R47AAXX090217)" ) public String RUN_BARCODE ;
@Argument ( fullName = "cycle_ranges" , shortName = "CR" , doc = "Cycle ranges for single-end or paired reads (i.e. '0-50,56-106') (0-based, inclusive)" ) public String CYCLE_RANGES ;
2009-05-13 03:46:34 +08:00
@Argument ( fullName = "sam_out" , shortName = "SO" , doc = "Output path for sam file" ) public File SAM_OUT ;
2009-05-22 03:40:47 +08:00
@Argument ( fullName = "sam_in" , shortName = "SI" , doc = "The file to use for training and annotation" , required = false ) public File SAM_IN ;
@Argument ( fullName = "training_limit" , shortName = "T" , doc = "Number of reads to use for parameter initialization" , required = false ) public int TRAINING_LIMIT = 100000 ;
@Argument ( fullName = "calling_limit" , shortName = "C" , doc = "Number of reads to basecall" , required = false ) public int CALLING_LIMIT = Integer . MAX_VALUE ;
2009-05-13 03:46:34 +08:00
public static void main ( String [ ] argv ) {
Instance = new AnnotateSecondaryBase ( ) ;
start ( Instance , argv ) ;
}
protected int execute ( ) {
2009-05-20 08:07:24 +08:00
ArrayList < Pair < Integer , Integer > > cycleRanges = getCycleRanges ( CYCLE_RANGES ) ;
2009-05-15 00:58:22 +08:00
2009-05-20 08:07:24 +08:00
BasecallingTrainer trainer = new BasecallingTrainer ( BUSTARD_DIR , LANE , TRAINING_LIMIT ) ;
2009-05-22 03:40:47 +08:00
// Iterate through raw Firecrest data and store the first unambiguous N reads up to TRAINING_LIMIT
2009-05-17 12:09:23 +08:00
System . out . println ( "Loading training set from the first " + TRAINING_LIMIT + " unambiguous reads in the raw data..." ) ;
2009-05-20 08:07:24 +08:00
trainer . loadFirstNUnambiguousReadsTrainingSet ( ) ;
2009-05-13 03:46:34 +08:00
2009-05-15 00:58:22 +08:00
// Iterate through the stored training data and add the info to the BasecallingReadModel
2009-05-15 02:01:41 +08:00
System . out . println ( "Applying training set..." ) ;
2009-05-20 08:07:24 +08:00
BasecallingReadModel model = new BasecallingReadModel ( trainer . getTrainingData ( ) ) ;
2009-05-15 00:58:22 +08:00
// Call bases and write results
2009-05-20 08:07:24 +08:00
System . out . println ( "Calling bases..." ) ;
2009-05-15 00:58:22 +08:00
SAMFileHeader sfh = new SAMFileHeader ( ) ;
2009-05-20 08:07:24 +08:00
sfh . setSortOrder ( SAMFileHeader . SortOrder . queryname ) ;
2009-05-22 03:40:47 +08:00
File unalignedSam = ( canAnnotate ( SAM_IN ) ) ? getTempSAMFile ( "unaligned" ) : SAM_OUT ;
SAMFileWriter sfw = new SAMFileWriterFactory ( ) . makeSAMOrBAMWriter ( sfh , false , unalignedSam ) ;
2009-05-15 00:58:22 +08:00
2009-05-20 08:07:24 +08:00
IlluminaParser iparser = new IlluminaParser ( BUSTARD_DIR , LANE ) ;
2009-05-19 01:42:08 +08:00
2009-05-20 08:07:24 +08:00
BasecallingStats bstats = new BasecallingStats ( ) ;
2009-05-19 01:42:08 +08:00
2009-05-20 08:07:24 +08:00
while ( bstats . getReadsTotal ( ) < CALLING_LIMIT & & iparser . next ( ) ) {
RawRead rr = iparser . getRawRead ( ) ;
FourProbRead fpr = model . call ( rr ) ;
2009-05-19 01:42:08 +08:00
2009-05-20 08:07:24 +08:00
for ( int cycleRangeIndex = 0 ; cycleRangeIndex < cycleRanges . size ( ) ; cycleRangeIndex + + ) {
Pair < Integer , Integer > cycleRange = cycleRanges . get ( cycleRangeIndex ) ;
2009-05-19 01:42:08 +08:00
2009-05-20 08:07:24 +08:00
RawRead rrEnd = iparser . getSubset ( cycleRange . getFirst ( ) , cycleRange . getSecond ( ) ) ;
FourProbRead fprEnd = fpr . getSubset ( cycleRange . getFirst ( ) , cycleRange . getSecond ( ) ) ;
2009-05-19 01:42:08 +08:00
2009-05-20 08:07:24 +08:00
sfw . addAlignment ( constructSAMRecord ( rrEnd , fprEnd , sfh , RUN_BARCODE , cycleRanges . size ( ) = = 2 , cycleRangeIndex = = 1 ) ) ;
2009-05-15 00:58:22 +08:00
2009-05-20 08:07:24 +08:00
if ( cycleRangeIndex = = 0 ) {
bstats . update ( rrEnd , fprEnd ) ;
bstats . notifyOnInterval ( 5000 ) ;
}
}
}
2009-05-17 12:09:23 +08:00
2009-05-20 08:07:24 +08:00
bstats . notifyNow ( ) ;
2009-05-19 01:42:08 +08:00
2009-05-20 08:07:24 +08:00
iparser . close ( ) ;
sfw . close ( ) ;
2009-05-19 01:42:08 +08:00
2009-05-20 08:07:24 +08:00
if ( canAnnotate ( SAM_IN ) ) {
// If we're in annotation mode, annotate the aligned BAM file with the SQ tag
2009-05-22 03:40:47 +08:00
System . out . println ( "Annotating aligned SAM file..." ) ;
2009-05-20 08:07:24 +08:00
2009-05-22 03:40:47 +08:00
System . out . println ( " sorting aligned SAM file by read name..." ) ;
File alignedSam = getTempSAMFile ( "aligned" ) ;
sortBAMByReadName ( SAM_IN , alignedSam ) ;
System . out . println ( " merging unaligned and aligned SAM files..." ) ;
File mergedSam = SAM_OUT ;
mergeUnalignedAndAlignedBams ( unalignedSam , alignedSam , mergedSam ) ;
2009-05-13 03:46:34 +08:00
}
2009-05-20 08:07:24 +08:00
System . out . println ( "Done." ) ;
2009-05-13 03:46:34 +08:00
2009-05-20 08:07:24 +08:00
return 0 ;
}
2009-05-19 01:42:08 +08:00
2009-05-22 03:40:47 +08:00
/ * *
* Return a tempfile . This is a laziness method so that I don ' t have to litter my code with try / catch blocks for IOExceptions .
*
* @param prefix the prefix for the temp file
* @return the temp file
* /
private File getTempSAMFile ( String prefix ) {
try {
File tempFile = File . createTempFile ( prefix , ".sam" , SAM_OUT . getParentFile ( ) ) ;
2009-05-22 04:36:12 +08:00
//tempFile.deleteOnExit();
// Ensure that the volumes we're about to use are ready.
PathUtils . refreshVolume ( tempFile ) ;
PathUtils . refreshVolume ( new File ( System . getProperty ( "java.io.tmpdir" ) ) ) ;
2009-05-22 03:40:47 +08:00
return tempFile ;
} catch ( IOException e ) {
throw new StingException ( "Unable to create tempfile in directory " + SAM_OUT . getParent ( ) ) ;
}
}
2009-05-20 08:07:24 +08:00
/ * *
* Parse the cycle_ranges string that defines the cycle where a read starts and stops .
* Comma - separated ranges are interpreted to be the first and second end of a pair .
*
* @param cycleRangesString the 0 - based , inclusive , comma - separated ranges ( i . e . ' 0 - 50 , 51 - 100 ' )
* @return an ArrayList of cycle ranges
* /
private ArrayList < Pair < Integer , Integer > > getCycleRanges ( String cycleRangesString ) {
ArrayList < Pair < Integer , Integer > > cycleRanges = new ArrayList < Pair < Integer , Integer > > ( ) ;
2009-05-19 01:42:08 +08:00
2009-05-20 08:07:24 +08:00
String [ ] pieces = cycleRangesString . split ( "," ) ;
Pattern p = Pattern . compile ( "(\\d+)-(\\d+)" ) ;
2009-05-19 01:42:08 +08:00
2009-05-22 03:40:47 +08:00
for ( String piece : pieces ) {
Matcher m = p . matcher ( piece ) ;
2009-05-19 01:42:08 +08:00
2009-05-20 08:07:24 +08:00
if ( m . find ( ) ) {
Integer cycleStart = new Integer ( m . group ( 1 ) ) ;
Integer cycleStop = new Integer ( m . group ( 2 ) ) ;
2009-05-19 01:42:08 +08:00
2009-05-20 08:07:24 +08:00
cycleRanges . add ( new Pair < Integer , Integer > ( cycleStart , cycleStop ) ) ;
2009-05-19 01:42:08 +08:00
}
2009-05-20 08:07:24 +08:00
}
2009-05-19 01:42:08 +08:00
2009-05-20 08:07:24 +08:00
if ( cycleRanges . size ( ) = = 0 ) {
throw new StingException ( "At least one cycle range must be specified." ) ;
2009-05-19 01:42:08 +08:00
}
2009-05-20 08:07:24 +08:00
if ( cycleRanges . size ( ) > 2 ) {
throw new StingException ( cycleRanges . size ( ) + " specified, but we're unable to handle more than 2." ) ;
}
2009-05-15 02:01:41 +08:00
2009-05-20 08:07:24 +08:00
return cycleRanges ;
2009-05-13 03:46:34 +08:00
}
2009-05-19 01:42:08 +08:00
/ * *
* Simple test to determine whether we ' re in aligned bam annotation mode or not .
*
* @param samfile the aligned sam file
* @return true if the file exists , false otherwise
* /
private boolean canAnnotate ( File samfile ) {
return ( samfile ! = null & & samfile . exists ( ) ) ;
}
2009-05-16 05:22:24 +08:00
/ * *
* Construct a SAMRecord object with the specified information . The secondary bases
* will be annotated suchthat they will not conflict with the primary base .
*
2009-05-22 03:40:47 +08:00
* @param rr the raw Illumina read
* @param fpr the four - base distributions for every base in the read
* @param sfh the SAM header
* @param runBarcode the run barcode of the lane ( used to prefix the reads )
* @param isPaired is this a paired - end lane ?
* @param isSecondEndOfPair is this the second end of the pair ?
2009-05-16 05:22:24 +08:00
*
* @return a fully - constructed SAM record
* /
2009-05-22 03:40:47 +08:00
private SAMRecord constructSAMRecord ( RawRead rr , FourProbRead fpr , SAMFileHeader sfh , String runBarcode , boolean isPaired , boolean isSecondEndOfPair ) {
2009-05-15 00:58:22 +08:00
SAMRecord sr = new SAMRecord ( sfh ) ;
sr . setReadName ( runBarcode + ":" + rr . getReadKey ( ) + "#0" ) ;
sr . setReadUmappedFlag ( true ) ;
sr . setReadString ( rr . getSequenceAsString ( ) ) ;
sr . setBaseQualities ( rr . getQuals ( ) ) ;
2009-05-20 08:07:24 +08:00
sr . setReadPairedFlag ( isPaired ) ;
if ( isPaired ) {
2009-05-22 03:40:47 +08:00
sr . setMateUnmappedFlag ( true ) ;
sr . setFirstOfPairFlag ( ! isSecondEndOfPair ) ;
2009-05-20 08:07:24 +08:00
}
2009-05-15 00:58:22 +08:00
sr . setAttribute ( "SQ" , fpr . getSQTag ( rr ) ) ;
2009-05-13 03:46:34 +08:00
2009-05-15 00:58:22 +08:00
return sr ;
2009-05-13 03:46:34 +08:00
}
2009-05-20 08:07:24 +08:00
/ * *
* Resorts a SAM file to queryname order .
*
* @param samFile the input SAM file
* @param sortedSamFile the sorted SAM output file
* /
private void sortBAMByReadName ( File samFile , File sortedSamFile ) {
SAMFileReader samIn = new SAMFileReader ( samFile ) ;
SAMFileHeader sfh = samIn . getFileHeader ( ) ;
sfh . setSortOrder ( SAMFileHeader . SortOrder . queryname ) ;
SAMFileWriter samOut = new SAMFileWriterFactory ( ) . makeSAMOrBAMWriter ( sfh , false , sortedSamFile ) ;
for ( SAMRecord sr : samIn ) {
samOut . addAlignment ( sr ) ;
}
samIn . close ( ) ;
samOut . close ( ) ;
}
/ * *
* Merges two SAM files that have been sorted in queryname order
*
2009-05-22 03:40:47 +08:00
* @param queryNameSortedUnalignedSam the sorted unaligned SAM file
* @param queryNameSortedAlignedSam the sorted aligned SAM file
* @param mergedSam the output file where the merged results should be stored
2009-05-20 08:07:24 +08:00
* /
private void mergeUnalignedAndAlignedBams ( File queryNameSortedUnalignedSam , File queryNameSortedAlignedSam , File mergedSam ) {
SAMFileReader usam = new SAMFileReader ( queryNameSortedUnalignedSam ) ;
SAMFileReader asam = new SAMFileReader ( queryNameSortedAlignedSam ) ;
SAMFileHeader sfh = asam . getFileHeader ( ) ;
sfh . setSortOrder ( SAMFileHeader . SortOrder . coordinate ) ;
SAMFileWriter samOut = new SAMFileWriterFactory ( ) . makeSAMOrBAMWriter ( sfh , false , mergedSam ) ;
CloseableIterator < SAMRecord > usamIt = usam . iterator ( ) ;
CloseableIterator < SAMRecord > asamIt = asam . iterator ( ) ;
SAMRecord usr = usamIt . next ( ) ;
SAMRecord asr = asamIt . next ( ) ;
do {
2009-05-22 03:40:47 +08:00
// Pull a record from the unaligned file and advance the aligned file until we find the matching record. We
// don't have to advance the unaligned file until we find our record because we assume every record we generate
// will be in the aligned file (which also contains unaligned reads).
//
// If Picard ever stops storing the unaligned reads, this logic will need to be rewritten.
if ( usr . getReadName ( ) . equals ( asr . getReadName ( ) ) & & usr . getFirstOfPairFlag ( ) = = asr . getFirstOfPairFlag ( ) ) {
2009-05-20 08:07:24 +08:00
byte [ ] sqtag = ( byte [ ] ) usr . getAttribute ( "SQ" ) ;
2009-05-22 03:40:47 +08:00
String usrread = usr . getReadString ( ) ;
String asrread = asr . getReadString ( ) ;
2009-05-20 08:07:24 +08:00
2009-05-22 03:40:47 +08:00
if ( asr . getReadNegativeStrandFlag ( ) ) {
sqtag = QualityUtils . reverseComplementCompressedQualityArray ( sqtag ) ;
asrread = BaseUtils . simpleReverseComplement ( asrread ) ;
}
2009-05-20 08:07:24 +08:00
2009-05-22 03:40:47 +08:00
if ( usrread ! = null & & asrread ! = null & & ! usrread . equals ( asrread ) ) {
throw new StingException (
String . format ( "Purportedly identical unaligned and aligned reads have different read sequences. Perhaps this lane was reanalyzed by the Illumina software but not the production pipeline?\n '%s:%b:%s'\n '%s:%b:%s'" ,
usr . getReadName ( ) , usr . getFirstOfPairFlag ( ) , usrread ,
asr . getReadName ( ) , asr . getFirstOfPairFlag ( ) , asrread ) ) ;
2009-05-20 08:07:24 +08:00
}
2009-05-22 03:40:47 +08:00
asr . setAttribute ( "SQ" , sqtag ) ;
2009-05-20 08:07:24 +08:00
usr = usamIt . next ( ) ;
2009-05-22 03:40:47 +08:00
} else {
2009-05-20 08:07:24 +08:00
asr = asamIt . next ( ) ;
}
2009-05-22 03:40:47 +08:00
samOut . addAlignment ( asr ) ;
2009-05-20 08:07:24 +08:00
} while ( usamIt . hasNext ( ) & & asamIt . hasNext ( ) ) ;
usam . close ( ) ;
asam . close ( ) ;
samOut . close ( ) ;
}
2009-05-22 03:40:47 +08:00
/ * *
* For debugging purposes . Spits out relevant information for two SAMRecords .
*
* @param sra first SAMRecord
* @param srb second SAMRecord
* /
private void printRecords ( SAMRecord sra , SAMRecord srb ) {
System . out . println ( "a: " + sra . getReadName ( ) + " " + sra . getFirstOfPairFlag ( ) ) ;
System . out . println ( "b: " + srb . getReadName ( ) + " " + srb . getFirstOfPairFlag ( ) ) ;
System . out . println ( ) ;
}
2009-05-13 03:46:34 +08:00
}