2009-05-15 00:53:07 +08:00
package org.broadinstitute.sting.secondarybase ;
2009-05-15 02:58:43 +08:00
import edu.mit.broad.picard.util.BasicTextFileParser ;
import edu.mit.broad.picard.util.PasteParser ;
2009-05-15 00:53:07 +08:00
import org.broadinstitute.sting.utils.StingException ;
2009-05-15 02:58:43 +08:00
import java.io.Closeable ;
2009-05-15 00:53:07 +08:00
import java.io.File ;
import java.io.FilenameFilter ;
import java.util.Arrays ;
import java.util.Comparator ;
2009-05-15 02:58:43 +08:00
import java.util.regex.Matcher ;
import java.util.regex.Pattern ;
2009-05-15 00:53:07 +08:00
2009-05-22 03:40:47 +08:00
/ * *
* IlluminaParser parses raw Illumina data ( raw intensities , basecalled read sequences and quality scores )
* and presents it to the developer in an easy - to - use form . It also permits random tile jumping .
*
* WARNING : This parser does not understand newer GAPipeline data formats , and instead relies on the older
* data formats that may have been generated with older Illumina tools . As a result , this may return
* suboptimal data . This parser only exists temporarily until the Picard team writes a much more sensible
* version . Proceed with caution .
*
* @author Kiran Garimella
* /
2009-05-20 08:09:20 +08:00
public class IlluminaParser implements Closeable {
2009-05-15 00:53:07 +08:00
private File bustardDir ;
private File firecrestDir ;
private int lane ;
private File [ ] intfiles ;
private File [ ] seqfiles ;
private File [ ] prbfiles ;
private int currentTileIndex ;
private PasteParser currentTileParser ;
2009-05-20 08:09:20 +08:00
private String [ ] [ ] currentParseResult ;
2009-05-15 00:53:07 +08:00
2009-05-22 03:40:47 +08:00
/ * *
* Construct an IlluminaParser given the Bustard directory and lane . Infer the Firecrest directory .
*
* @param bustardDir the Illumina Bustard directory
* @param lane the Illumina lane
* /
2009-05-20 08:09:20 +08:00
public IlluminaParser ( File bustardDir , int lane ) {
2009-05-15 00:53:07 +08:00
this . bustardDir = bustardDir ;
this . firecrestDir = bustardDir . getParentFile ( ) ;
this . lane = lane ;
initializeParser ( ) ;
}
2009-05-22 03:40:47 +08:00
/ * *
* Construct an IlluminaParser given the Bustard directory , Firecrest directory and lane .
*
* @param bustardDir the Illumina Bustard directory
* @param firecrestDir the Illumina Firecrest directory
* @param lane the Illumina lane
* /
2009-05-20 08:09:20 +08:00
public IlluminaParser ( File bustardDir , File firecrestDir , int lane ) {
2009-05-15 00:53:07 +08:00
this . bustardDir = bustardDir ;
this . firecrestDir = firecrestDir ;
this . lane = lane ;
2009-05-20 08:09:20 +08:00
2009-05-15 00:53:07 +08:00
initializeParser ( ) ;
}
2009-05-22 03:40:47 +08:00
/ * *
* Initialize the parser and seek to the first tile .
* /
2009-05-15 00:53:07 +08:00
private void initializeParser ( ) {
intfiles = firecrestDir . listFiles ( getFilenameFilter ( "int" ) ) ;
seqfiles = bustardDir . listFiles ( getFilenameFilter ( "seq" ) ) ;
prbfiles = bustardDir . listFiles ( getFilenameFilter ( "prb" ) ) ;
if ( intfiles . length ! = seqfiles . length | | intfiles . length ! = prbfiles . length | | seqfiles . length ! = prbfiles . length ) {
throw new StingException (
String . format ( "File list lengths are unequal (int:%d, seq:%d, prb:%d)" ,
intfiles . length ,
seqfiles . length ,
prbfiles . length )
) ;
}
Arrays . sort ( intfiles , getTileSortingComparator ( ) ) ;
Arrays . sort ( seqfiles , getTileSortingComparator ( ) ) ;
Arrays . sort ( prbfiles , getTileSortingComparator ( ) ) ;
2009-05-20 08:09:20 +08:00
seekToTile ( 1 ) ;
2009-05-15 00:53:07 +08:00
// Todo: put some more consistency checks here
}
2009-05-22 03:40:47 +08:00
/ * *
* Get the filename filter for files of a given type .
*
* @param suffix the type ( i . e . ' int ' , ' seq ' , ' prb ' ) .
* @return the filename filter
* /
2009-05-15 00:53:07 +08:00
private FilenameFilter getFilenameFilter ( final String suffix ) {
return new FilenameFilter ( ) {
public boolean accept ( File file , String s ) {
Pattern pseq = Pattern . compile ( String . format ( "s_%d_\\d+_%s\\.txt(?!.+old.+)(\\.gz)?" , lane , suffix ) ) ;
Matcher mseq = pseq . matcher ( s ) ;
return mseq . find ( ) ;
}
} ;
}
2009-05-22 03:40:47 +08:00
/ * *
* Get a comparator that sorts by tile .
*
* @return the comparator that sorts by tile .
* /
2009-05-15 00:53:07 +08:00
private Comparator < File > getTileSortingComparator ( ) {
return new Comparator < File > ( ) {
public int compare ( File file1 , File file2 ) {
Pattern ptile = Pattern . compile ( String . format ( "s_%d_(\\d+)_" , lane ) ) ;
Matcher mtile1 = ptile . matcher ( file1 . getName ( ) ) ;
Matcher mtile2 = ptile . matcher ( file2 . getName ( ) ) ;
if ( mtile1 . find ( ) & & mtile2 . find ( ) ) {
int tile1 = Integer . valueOf ( mtile1 . group ( 1 ) ) ;
int tile2 = Integer . valueOf ( mtile2 . group ( 1 ) ) ;
if ( tile1 < tile2 ) { return - 1 ; }
else if ( tile1 > tile2 ) { return 1 ; }
return 0 ;
}
throw new StingException ( "Tile filenames ('" + file1 . getName ( ) + "' or '" + file2 . getName ( ) + "') did not match against regexp pattern ('" + ptile . pattern ( ) + "')" ) ;
}
} ;
}
2009-05-22 03:40:47 +08:00
/ * *
* Return the number of tiles .
*
* @return the number of tiles .
* /
2009-05-15 00:53:07 +08:00
public int numTiles ( ) { return intfiles . length ; }
2009-05-22 03:40:47 +08:00
/ * *
* Seek to a specified tile .
*
* @param tile the tile to which we should seek
* @return true if we were able to seek to the tile , false if otherwise
* /
2009-05-15 00:53:07 +08:00
public boolean seekToTile ( int tile ) {
if ( tile < intfiles . length - 1 ) {
currentTileIndex = tile - 1 ;
BasicTextFileParser intparser = new BasicTextFileParser ( true , intfiles [ currentTileIndex ] ) ;
BasicTextFileParser seqparser = new BasicTextFileParser ( true , seqfiles [ currentTileIndex ] ) ;
BasicTextFileParser prbparser = new BasicTextFileParser ( true , prbfiles [ currentTileIndex ] ) ;
currentTileParser = new PasteParser ( intparser , seqparser , prbparser ) ;
return true ;
}
return false ;
}
2009-05-22 03:40:47 +08:00
/ * *
* Returns whether the parser has any more data to go through .
*
* @return true if there ' s data left , false if otherwise
* /
2009-05-15 00:53:07 +08:00
public boolean hasNext ( ) {
return ( currentTileParser . hasNext ( ) | | seekToTile ( currentTileIndex + 1 ) ) ;
}
2009-05-22 03:40:47 +08:00
/ * *
* Advance the parser to the next read .
*
* @return true if successful , false if otherwise
* /
2009-05-20 08:09:20 +08:00
public boolean next ( ) {
2009-05-15 00:53:07 +08:00
if ( hasNext ( ) ) {
2009-05-20 08:09:20 +08:00
currentParseResult = currentTileParser . next ( ) ;
return true ;
2009-05-15 00:53:07 +08:00
}
2009-05-20 08:09:20 +08:00
return false ;
}
2009-05-22 03:40:47 +08:00
/ * *
* Returns the result from the current parse as an matrix of Strings .
*
* @return the matrix of Strings containing the current parse result
* /
2009-05-20 08:09:20 +08:00
public String [ ] [ ] getCurrentParseResult ( ) {
return currentParseResult ;
2009-05-15 00:53:07 +08:00
}
2009-05-22 03:40:47 +08:00
/ * *
* Removes , um , something , but in reality , does nothing .
* /
2009-05-15 00:53:07 +08:00
public void remove ( ) {
throw new UnsupportedOperationException ( "IlluminaParser.remove() method is not supported." ) ;
}
2009-05-22 03:40:47 +08:00
/ * *
* Close the current tile .
* /
2009-05-20 08:09:20 +08:00
public void close ( ) {
currentTileParser . close ( ) ;
}
2009-05-15 00:53:07 +08:00
2009-05-22 03:40:47 +08:00
/ * *
* Returns a raw read containing the raw intensities , read sequence , and quality scores .
*
* @return the raw read
* /
2009-05-20 08:09:20 +08:00
public RawRead getRawRead ( ) {
return getSubset ( 0 , currentParseResult [ 1 ] [ 4 ] . length ( ) - 1 ) ;
2009-05-15 00:53:07 +08:00
}
2009-05-22 03:40:47 +08:00
/ * *
* Returns a subset of the current parse result as a raw read .
*
* @param cycleStart the starting cycle for the desired subset
* @param cycleStop the ending cycle for the desired subset
* @return the subset of the current parse result as a raw read
* /
2009-05-20 08:09:20 +08:00
public RawRead getSubset ( int cycleStart , int cycleStop ) {
return new RawRead ( currentParseResult , cycleStart , cycleStop ) ;
2009-05-15 00:53:07 +08:00
}
}