2009-03-16 06:37:20 +08:00
package org.broadinstitute.sting.gatk.refdata ;
2009-07-17 05:03:47 +08:00
import org.apache.log4j.Logger ;
import org.broadinstitute.sting.utils.MalformedGenomeLocException ;
2009-08-12 06:10:20 +08:00
import org.broadinstitute.sting.utils.StingException ;
2009-07-17 05:03:47 +08:00
import org.broadinstitute.sting.utils.Utils ;
import org.broadinstitute.sting.utils.xReadLines ;
2009-08-12 06:10:20 +08:00
import java.io.* ;
2009-04-04 00:41:33 +08:00
import java.lang.reflect.Constructor ;
import java.lang.reflect.InvocationTargetException ;
2009-05-21 23:23:22 +08:00
import java.lang.reflect.Method ;
2009-07-17 05:03:47 +08:00
import java.util.* ;
2009-03-16 06:37:20 +08:00
/**
 * Class for representing arbitrary reference ordered data sets
 * <p/>
 * User: mdepristo
 * Date: Feb 27, 2009
 * Time: 10:47:14 AM
 * To change this template use File | Settings | File Templates.
 */
public class ReferenceOrderedData < ROD extends ReferenceOrderedDatum > implements Iterable < ROD > {
2009-04-04 00:41:33 +08:00
private String name ;
2009-03-16 06:37:20 +08:00
private File file = null ;
2009-05-15 05:06:28 +08:00
private String fieldDelimiter ;
2009-08-12 06:10:20 +08:00
/** Header object returned from the datum */
2009-05-15 05:06:28 +08:00
private Object header = null ;
2009-08-12 06:10:20 +08:00
2009-03-16 06:37:20 +08:00
private Class < ROD > type = null ; // runtime type information for object construction
2009-08-12 06:10:20 +08:00
/** our log, which we want to capture anything from this class */
private static Logger logger = Logger . getLogger ( ReferenceOrderedData . class ) ;
2009-04-10 06:04:59 +08:00
// ----------------------------------------------------------------------
//
// Static ROD type management
//
// ----------------------------------------------------------------------
public static class RODBinding {
public final String name ;
public final Class < ? extends ReferenceOrderedDatum > type ;
2009-08-12 06:10:20 +08:00
2009-04-10 06:04:59 +08:00
public RODBinding ( final String name , final Class < ? extends ReferenceOrderedDatum > type ) {
this . name = name ;
this . type = type ;
}
}
public static HashMap < String , RODBinding > Types = new HashMap < String , RODBinding > ( ) ;
2009-08-12 06:10:20 +08:00
2009-04-10 06:04:59 +08:00
public static void addModule ( final String name , final Class < ? extends ReferenceOrderedDatum > rodType ) {
2009-07-17 05:03:47 +08:00
final String boundName = name . toLowerCase ( ) ;
2009-08-12 06:10:20 +08:00
if ( Types . containsKey ( boundName ) ) {
2009-07-17 05:03:47 +08:00
throw new RuntimeException ( String . format ( "GATK BUG: adding ROD module %s that is already bound" , boundName ) ) ;
}
2009-09-05 03:13:37 +08:00
logger . info ( String . format ( "* Adding rod class %s" , name ) ) ;
2009-07-17 05:03:47 +08:00
Types . put ( boundName , new RODBinding ( name , rodType ) ) ;
2009-04-10 06:04:59 +08:00
}
// Registers every built-in ROD type when the class is loaded. The registration
// order is kept as-is because addModule logs each name as it is added.
static {
    addModule("GFF", RodGenotypeChipAsGFF.class);
    addModule("dbSNP", rodDbSNP.class);
    addModule("HapMapAlleleFrequencies", HapMapAlleleFrequenciesROD.class);
    addModule("SAMPileup", rodSAMPileup.class);
    addModule("GELI", rodGELI.class);
    addModule("RefSeq", rodRefSeq.class);
    addModule("Table", TabularROD.class);
    addModule("PooledEM", PooledEMSNPROD.class);
    addModule("CleanedOutSNP", CleanedOutSNPROD.class);
    addModule("SangerSNP", SangerSNPROD.class);
    addModule("SimpleIndel", SimpleIndelROD.class);
    addModule("PointIndel", PointIndelROD.class);
    addModule("HapMapGenotype", HapMapGenotypeROD.class);
    addModule("Intervals", IntervalRod.class);
    addModule("Variants", RodGeliText.class);
    addModule("GLF", RodGLF.class);
}
/ * *
* Parse the ROD bindings . These are of the form of a single list of strings , each triplet of the
2009-05-08 23:28:19 +08:00
* form < name > , < type > , < file > . After this function , the List of RODs contains new RODs bound to each of
2009-04-10 06:04:59 +08:00
* name , of type , ready to read from the file . This function does check for the strings to be well formed
* and such .
*
* @param bindings
* @param rods
* /
2009-08-12 06:10:20 +08:00
public static void parseBindings ( ArrayList < String > bindings , List < ReferenceOrderedData < ? extends ReferenceOrderedDatum > > rods ) {
// pre-process out any files that were passed in as rod binding command line options
for ( int x = 0 ; x < bindings . size ( ) ; x + + ) {
if ( new File ( bindings . get ( x ) ) . exists ( ) ) {
extractRodsFromFile ( bindings , bindings . get ( x ) ) ;
bindings . remove ( x ) ;
x - - ;
}
}
2009-04-10 06:04:59 +08:00
// Loop over triplets
2009-08-12 06:10:20 +08:00
for ( String bindingSets : bindings ) {
2009-05-28 06:02:24 +08:00
String [ ] bindingTokens = bindingSets . split ( "," ) ;
2009-08-12 06:10:20 +08:00
if ( bindingTokens . length % 3 ! = 0 )
2009-05-08 23:28:19 +08:00
Utils . scareUser ( String . format ( "Invalid ROD specification: requires triplets of <name>,<type>,<file> but got %s" , Utils . join ( "," , bindings ) ) ) ;
2009-08-12 06:10:20 +08:00
for ( int bindingSet = 0 ; bindingSet < bindingTokens . length ; bindingSet + = 3 ) {
2009-05-28 06:02:24 +08:00
logger . info ( "Processing ROD bindings: " + bindings . size ( ) + " -> " + Utils . join ( " : " , bindingTokens ) ) ;
2009-04-10 06:04:59 +08:00
2009-05-28 06:02:24 +08:00
final String name = bindingTokens [ bindingSet ] ;
final String typeName = bindingTokens [ bindingSet + 1 ] ;
final String fileName = bindingTokens [ bindingSet + 2 ] ;
2009-04-10 06:04:59 +08:00
2009-08-12 06:10:20 +08:00
ReferenceOrderedData < ? > rod = parse1Binding ( name , typeName , fileName ) ;
2009-04-10 06:04:59 +08:00
2009-05-28 06:02:24 +08:00
// check that we're not generating duplicate bindings
2009-08-12 06:10:20 +08:00
for ( ReferenceOrderedData rod2 : rods )
if ( rod2 . getName ( ) . equals ( rod . getName ( ) ) )
2009-05-28 06:02:24 +08:00
Utils . scareUser ( String . format ( "Found duplicate rod bindings" , rod . getName ( ) ) ) ;
rods . add ( rod ) ;
}
2009-04-10 06:04:59 +08:00
}
}
2009-08-12 06:10:20 +08:00
/ * *
* given a existing file , open it and append all the valid triplet lines to an existing list
*
* @param rodTripletList the list of existing triplets
* @param filename the file to attempt to extract ROD triplets from
* /
protected static void extractRodsFromFile ( List < String > rodTripletList , String filename ) {
BufferedReader str ;
try {
str = new BufferedReader ( new FileReader ( new File ( filename ) ) ) ;
} catch ( FileNotFoundException e ) {
throw new StingException ( "Unable to load the ROD input file " + filename , e ) ;
}
String line = "NO LINES READ IN" ;
try {
while ( ( line = str . readLine ( ) ) ! = null ) {
if ( line . matches ( ".+,.+,.+" ) ) rodTripletList . add ( line . trim ( ) ) ;
else logger . warn ( "the following file line didn't parsing into a triplet -> " + line ) ;
}
} catch ( IOException e ) {
throw new StingException ( "Failed reading the input rod file " + filename + " last line read was " + line , e ) ;
}
}
2009-04-10 06:04:59 +08:00
/ * *
* Helpful function that parses a single triplet of < name > < type > < file > and returns the corresponding ROD with
* < name > , of type < type > that reads its input from < file > .
2009-08-12 06:10:20 +08:00
*
2009-04-10 06:04:59 +08:00
* @param trackName
* @param typeName
* @param fileName
* @return
* /
2009-08-12 06:10:20 +08:00
private static ReferenceOrderedData < ? > parse1Binding ( final String trackName , final String typeName , final String fileName ) {
2009-04-10 06:04:59 +08:00
// Gracefully fail if we don't have the type
2009-08-12 06:10:20 +08:00
if ( ReferenceOrderedData . Types . get ( typeName . toLowerCase ( ) ) = = null )
2009-04-10 06:04:59 +08:00
Utils . scareUser ( String . format ( "Unknown ROD type: %s" , typeName ) ) ;
// Lookup the type
Class rodClass = ReferenceOrderedData . Types . get ( typeName . toLowerCase ( ) ) . type ;
// Create the ROD
ReferenceOrderedData < ? > rod = new ReferenceOrderedData < ReferenceOrderedDatum > ( trackName . toLowerCase ( ) , new File ( fileName ) , rodClass ) ;
logger . info ( String . format ( "Created binding from %s to %s of type %s" , trackName . toLowerCase ( ) , fileName , rodClass ) ) ;
return rod ;
}
// ----------------------------------------------------------------------
//
// Constructors
//
// ----------------------------------------------------------------------
2009-04-04 00:41:33 +08:00
public ReferenceOrderedData ( final String name , File file , Class < ROD > type ) {
2009-03-16 06:37:20 +08:00
this . file = file ;
this . type = type ;
2009-04-04 00:41:33 +08:00
this . name = name ;
2009-05-15 05:06:28 +08:00
this . header = initializeROD ( name , file , type ) ;
2009-05-15 07:20:11 +08:00
this . fieldDelimiter = newROD ( name , type ) . delimiterRegex ( ) ;
2009-03-16 06:37:20 +08:00
}
2009-04-04 03:54:54 +08:00
/**
 * @return the track name this data set was constructed with
 */
public String getName() {
    return name;
}
2009-05-20 07:26:17 +08:00
/ * *
2009-05-24 04:50:28 +08:00
* Special equals override to see if this ROD is compatible with the given
* name and type . ' Compatible ' means that this ROD has the name that ' s passed
* in and its data can fit into the container specified by type .
2009-08-12 06:10:20 +08:00
*
2009-05-20 07:26:17 +08:00
* @param name Name to check .
* @param type Type to check .
2009-08-12 06:10:20 +08:00
*
2009-05-20 07:26:17 +08:00
* @return True if these parameters imply this rod . False otherwise .
* /
2009-08-12 06:10:20 +08:00
public boolean matches ( String name , Class < ? extends ReferenceOrderedDatum > type ) {
2009-05-24 04:50:28 +08:00
return this . name . equals ( name ) & & type . isAssignableFrom ( this . type ) ;
2009-05-20 07:26:17 +08:00
}
2009-06-06 07:34:37 +08:00
public RODIterator < ROD > iterator ( ) {
2009-08-12 06:10:20 +08:00
Iterator < ROD > it ;
2009-05-21 23:23:22 +08:00
try {
2009-08-12 06:10:20 +08:00
Method m = type . getDeclaredMethod ( "createIterator" , String . class , java . io . File . class ) ;
2009-05-21 23:23:22 +08:00
it = ( Iterator < ROD > ) m . invoke ( null , name , file ) ;
2009-08-12 06:10:20 +08:00
} catch ( java . lang . NoSuchMethodException e ) {
2009-05-21 23:23:22 +08:00
it = new SimpleRODIterator ( ) ;
2009-08-12 06:10:20 +08:00
} catch ( java . lang . NullPointerException e ) {
2009-05-21 23:23:22 +08:00
throw new RuntimeException ( e ) ;
2009-08-12 06:10:20 +08:00
} catch ( java . lang . SecurityException e ) {
2009-05-21 23:23:22 +08:00
throw new RuntimeException ( e ) ;
2009-08-12 06:10:20 +08:00
} catch ( java . lang . IllegalAccessException e ) {
throw new RuntimeException ( e ) ;
} catch ( java . lang . IllegalArgumentException e ) {
throw new RuntimeException ( e ) ;
} catch ( java . lang . reflect . InvocationTargetException e ) {
throw new RuntimeException ( e ) ;
}
2009-06-06 07:34:37 +08:00
return new RODIterator < ROD > ( it ) ;
2009-08-12 06:10:20 +08:00
}
2009-03-16 06:37:20 +08:00
// ----------------------------------------------------------------------
//
// Testing
//
// ----------------------------------------------------------------------
public void testMe ( ) {
2009-08-12 06:10:20 +08:00
for ( ReferenceOrderedDatum rec : this ) {
2009-04-02 06:54:38 +08:00
System . out . println ( rec . toString ( ) ) ;
2009-09-05 02:40:43 +08:00
RodGenotypeChipAsGFF gff = ( RodGenotypeChipAsGFF ) rec ;
2009-04-02 06:54:38 +08:00
String [ ] keys = { "LENGTH" , "ALT" , "FOBARBAR" } ;
2009-08-12 06:10:20 +08:00
for ( String key : keys ) {
2009-04-02 06:54:38 +08:00
System . out . printf ( " -> %s is (%s)%n" , key , gff . containsAttribute ( key ) ? gff . getAttribute ( key ) : "none" ) ;
2009-03-16 06:37:20 +08:00
}
}
System . exit ( 1 ) ;
}
// ----------------------------------------------------------------------
//
// Manipulations of all of the data
//
// ----------------------------------------------------------------------
public ArrayList < ReferenceOrderedDatum > readAll ( ) {
ArrayList < ReferenceOrderedDatum > elts = new ArrayList < ReferenceOrderedDatum > ( ) ;
2009-08-12 06:10:20 +08:00
for ( ReferenceOrderedDatum rec : this ) {
2009-03-16 06:37:20 +08:00
elts . add ( rec ) ;
}
elts . trimToSize ( ) ;
return elts ;
}
/**
 * Sorts the given records in place using their natural ordering.
 *
 * @param data the records to sort; reordered in place
 */
public static void sortRODDataInMemory(ArrayList<ReferenceOrderedDatum> data) {
    Collections.sort(data);
}
public static void write ( ArrayList < ReferenceOrderedDatum > data , File output ) throws IOException {
final FileWriter out = new FileWriter ( output ) ;
2009-08-12 06:10:20 +08:00
for ( ReferenceOrderedDatum rec : data ) {
2009-03-16 06:37:20 +08:00
out . write ( rec . repl ( ) + "\n" ) ;
}
out . close ( ) ;
}
public boolean validateFile ( ) throws Exception {
ReferenceOrderedDatum last = null ;
2009-08-12 06:10:20 +08:00
for ( ReferenceOrderedDatum rec : this ) {
if ( last ! = null & & last . compareTo ( rec ) = = 1 ) {
// It's out of order
throw new Exception ( "Out of order elements at \n" + last . toString ( ) + "\n" + rec . toString ( ) ) ;
}
last = rec ;
2009-03-16 06:37:20 +08:00
}
return true ;
}
/**
 * Placeholder for building an index over the file; currently does nothing.
 */
public void indexFile() {
    // Fixme -- get access to the linear index system from Jim
}
// ----------------------------------------------------------------------
//
// Iteration
//
// ----------------------------------------------------------------------
private class SimpleRODIterator implements Iterator < ROD > {
2009-04-02 06:54:38 +08:00
private xReadLines parser = null ;
2009-03-16 06:37:20 +08:00
public SimpleRODIterator ( ) {
2009-04-02 06:54:38 +08:00
try {
parser = new xReadLines ( file ) ;
2009-08-12 06:10:20 +08:00
} catch ( FileNotFoundException e ) {
2009-04-02 06:54:38 +08:00
Utils . scareUser ( "Couldn't open file: " + file ) ;
}
2009-03-16 06:37:20 +08:00
}
public boolean hasNext ( ) {
2009-04-02 06:54:38 +08:00
//System.out.printf("Parser has next: %b%n", parser.hasNext());
2009-03-16 06:37:20 +08:00
return parser . hasNext ( ) ;
}
public ROD next ( ) {
2009-06-03 02:14:46 +08:00
ROD n = null ;
boolean success = false ;
boolean firstFailure = true ;
do {
final String line = parser . next ( ) ;
2009-06-09 23:39:40 +08:00
//System.out.printf("Line is '%s'%n", line);
2009-06-03 02:14:46 +08:00
String parts [ ] = line . split ( fieldDelimiter ) ;
2009-08-12 06:10:20 +08:00
2009-06-03 02:14:46 +08:00
try {
n = parseLine ( parts ) ;
// Two failure conditions:
// 1) parseLine throws an exception.
// 2) parseLine returns null.
// 3) parseLine throws a RuntimeException.
// TODO: Clean this up so that all errors are handled in one spot.
success = ( n ! = null ) ;
}
2009-08-12 06:10:20 +08:00
catch ( MalformedGenomeLocException ex ) {
if ( firstFailure ) {
2009-06-05 23:49:03 +08:00
Utils . warnUser ( "Failed to parse contig on line '" + line + "'. The reason given was: " + ex . getMessage ( ) + " Skipping ahead to the next recognized GenomeLoc. " ) ;
2009-06-03 02:14:46 +08:00
firstFailure = false ;
}
2009-08-12 06:10:20 +08:00
if ( ! parser . hasNext ( ) )
2009-06-03 02:14:46 +08:00
Utils . warnUser ( "Unable to find more valid reference-ordered data. Giving up." ) ;
}
} while ( ! success & & parser . hasNext ( ) ) ;
return n ;
2009-03-16 06:37:20 +08:00
}
2009-08-12 06:10:20 +08:00
2009-03-16 06:37:20 +08:00
public void remove ( ) {
throw new UnsupportedOperationException ( ) ;
}
}
// ----------------------------------------------------------------------
//
// Parsing
//
// ----------------------------------------------------------------------
2009-08-12 06:10:20 +08:00
private Constructor < ROD > parsing_constructor ;
private ROD newROD ( final String name , final Class < ROD > type ) {
2009-03-16 06:37:20 +08:00
try {
2009-08-12 06:10:20 +08:00
return ( ROD ) parsing_constructor . newInstance ( name ) ;
} catch ( java . lang . InstantiationException e ) {
2009-05-15 05:06:28 +08:00
throw new RuntimeException ( e ) ;
2009-08-12 06:10:20 +08:00
} catch ( java . lang . IllegalAccessException e ) {
2009-05-15 05:06:28 +08:00
throw new RuntimeException ( e ) ;
2009-08-12 06:10:20 +08:00
} catch ( InvocationTargetException e ) {
2009-05-15 05:06:28 +08:00
throw new RuntimeException ( e ) ;
}
}
private Object initializeROD ( final String name , final File file , final Class < ROD > type ) {
2009-08-12 06:10:20 +08:00
try {
parsing_constructor = type . getConstructor ( String . class ) ;
}
catch ( java . lang . NoSuchMethodException e ) {
throw new RuntimeException ( e ) ;
}
2009-05-15 05:06:28 +08:00
ROD rod = newROD ( name , type ) ;
try {
return rod . initialize ( file ) ;
2009-08-12 06:10:20 +08:00
} catch ( FileNotFoundException e ) {
2009-05-15 05:06:28 +08:00
throw new RuntimeException ( e ) ;
}
}
private ROD parseLine ( final String [ ] parts ) {
//System.out.printf("Parsing GFFLine %s%n", Utils.join(" ", parts));
ROD obj = newROD ( name , type ) ;
try {
2009-08-12 06:10:20 +08:00
if ( ! obj . parseLine ( header , parts ) )
2009-05-15 05:06:28 +08:00
obj = null ;
} catch ( IOException e ) {
throw new RuntimeException ( "Badly formed ROD: " + e ) ;
2009-04-04 00:41:33 +08:00
}
2009-05-15 05:06:28 +08:00
return obj ;
2009-03-16 06:37:20 +08:00
}
}