2009-03-16 06:37:20 +08:00
package org.broadinstitute.sting.gatk.refdata ;
import java.io.File ;
import java.io.FileWriter ;
import java.io.IOException ;
2009-04-02 06:54:38 +08:00
import java.io.FileNotFoundException ;
2009-04-10 06:04:59 +08:00
import java.util.* ;
2009-04-04 00:41:33 +08:00
import java.lang.reflect.Constructor ;
import java.lang.reflect.InvocationTargetException ;
2009-05-21 23:23:22 +08:00
import java.lang.reflect.Method ;
2009-03-16 06:37:20 +08:00
import org.broadinstitute.sting.gatk.iterators.PushbackIterator ;
2009-05-28 02:20:43 +08:00
import org.broadinstitute.sting.gatk.refdata.rodRefSeq ;
2009-03-16 06:37:20 +08:00
import org.broadinstitute.sting.utils.GenomeLoc ;
2009-04-02 06:54:38 +08:00
import org.broadinstitute.sting.utils.xReadLines ;
import org.broadinstitute.sting.utils.Utils ;
2009-06-03 02:14:46 +08:00
import org.broadinstitute.sting.utils.MalformedGenomeLocException ;
2009-04-10 06:04:59 +08:00
import org.apache.log4j.Logger ;
2009-03-16 06:37:20 +08:00
/**
 * Class for representing arbitrary reference-ordered data (ROD) sets.
 *
 * User: mdepristo
 * Date: Feb 27, 2009
 * Time: 10:47:14 AM
 */
public class ReferenceOrderedData < ROD extends ReferenceOrderedDatum > implements Iterable < ROD > {
2009-04-04 00:41:33 +08:00
private String name ;
2009-03-16 06:37:20 +08:00
private File file = null ;
2009-05-15 05:06:28 +08:00
private String fieldDelimiter ;
/ * *
* Header object returned from the datum
* /
private Object header = null ;
2009-03-16 06:37:20 +08:00
private Class < ROD > type = null ; // runtime type information for object construction
2009-04-10 06:04:59 +08:00
// ----------------------------------------------------------------------
//
// Static ROD type management
//
// ----------------------------------------------------------------------
/**
 * Immutable pairing of a ROD module name with the datum class registered for it.
 */
public static class RODBinding {
    /** Datum class associated with {@link #name}. */
    public final Class<? extends ReferenceOrderedDatum> type;

    /** Module name exactly as it was passed to the constructor. */
    public final String name;

    /**
     * @param name module name to record
     * @param type datum class to record
     */
    public RODBinding(final String name, final Class<? extends ReferenceOrderedDatum> type) {
        this.type = type;
        this.name = name;
    }
}
public static HashMap < String , RODBinding > Types = new HashMap < String , RODBinding > ( ) ;
/**
 * Registers a ROD module in {@link #Types}.
 *
 * A notice is printed to standard output, then the binding is stored under the
 * lower-cased module name. The binding itself keeps the name as given.
 *
 * @param name    module name
 * @param rodType datum class that implements the module
 */
public static void addModule(final String name, final Class<? extends ReferenceOrderedDatum> rodType) {
    System.out.printf("* Adding rod class %s%n", name);
    final String key = name.toLowerCase();
    final RODBinding binding = new RODBinding(name, rodType);
    Types.put(key, binding);
}
static {
    // Register every ROD type known to this class. Registration order only affects
    // the order of the notices addModule prints.

    // Annotation / feature tracks
    addModule("GFF", rodGFF.class);
    addModule("dbSNP", rodDbSNP.class);
    addModule("HapMapAlleleFrequencies", HapMapAlleleFrequenciesROD.class);
    addModule("SAMPileup", rodSAMPileup.class);
    addModule("RefSeq", rodRefSeq.class);

    // Generic tabular input
    addModule("Table", TabularROD.class);

    // SNP / genotype call sets
    addModule("PooledEM", PooledEMSNPROD.class);
    addModule("1KGSNPs", KGenomesSNPROD.class);
    addModule("SangerSNP", SangerSNPROD.class);
    addModule("HapMapGenotype", HapMapGenotypeROD.class);

    // Intervals and variants
    addModule("Intervals", IntervalRod.class);
    addModule("Variants", rodVariants.class);
}
/ * *
* Parse the ROD bindings . These are of the form of a single list of strings , each triplet of the
2009-05-08 23:28:19 +08:00
* form < name > , < type > , < file > . After this function , the List of RODs contains new RODs bound to each of
2009-04-10 06:04:59 +08:00
* name , of type , ready to read from the file . This function does check for the strings to be well formed
* and such .
*
* @param logger
* @param bindings
* @param rods
* /
public static void parseBindings ( Logger logger , ArrayList < String > bindings , List < ReferenceOrderedData < ? extends ReferenceOrderedDatum > > rods )
{
// Loop over triplets
2009-05-28 06:02:24 +08:00
for ( String bindingSets : bindings ) {
String [ ] bindingTokens = bindingSets . split ( "," ) ;
if ( bindingTokens . length % 3 ! = 0 )
2009-05-08 23:28:19 +08:00
Utils . scareUser ( String . format ( "Invalid ROD specification: requires triplets of <name>,<type>,<file> but got %s" , Utils . join ( "," , bindings ) ) ) ;
2009-05-28 06:02:24 +08:00
for ( int bindingSet = 0 ; bindingSet < bindingTokens . length ; bindingSet + = 3 ) {
logger . info ( "Processing ROD bindings: " + bindings . size ( ) + " -> " + Utils . join ( " : " , bindingTokens ) ) ;
2009-04-10 06:04:59 +08:00
2009-05-28 06:02:24 +08:00
final String name = bindingTokens [ bindingSet ] ;
final String typeName = bindingTokens [ bindingSet + 1 ] ;
final String fileName = bindingTokens [ bindingSet + 2 ] ;
2009-04-10 06:04:59 +08:00
2009-05-28 06:02:24 +08:00
ReferenceOrderedData < ? > rod = parse1Binding ( logger , name , typeName , fileName ) ;
2009-04-10 06:04:59 +08:00
2009-05-28 06:02:24 +08:00
// check that we're not generating duplicate bindings
for ( ReferenceOrderedData rod2 : rods )
if ( rod2 . getName ( ) . equals ( rod . getName ( ) ) )
Utils . scareUser ( String . format ( "Found duplicate rod bindings" , rod . getName ( ) ) ) ;
rods . add ( rod ) ;
}
2009-04-10 06:04:59 +08:00
}
}
/ * *
* Helpful function that parses a single triplet of < name > < type > < file > and returns the corresponding ROD with
* < name > , of type < type > that reads its input from < file > .
*
* @param logger
* @param trackName
* @param typeName
* @param fileName
* @return
* /
private static ReferenceOrderedData < ? > parse1Binding ( Logger logger , final String trackName , final String typeName , final String fileName )
{
// Gracefully fail if we don't have the type
if ( ReferenceOrderedData . Types . get ( typeName . toLowerCase ( ) ) = = null )
Utils . scareUser ( String . format ( "Unknown ROD type: %s" , typeName ) ) ;
// Lookup the type
Class rodClass = ReferenceOrderedData . Types . get ( typeName . toLowerCase ( ) ) . type ;
// Create the ROD
ReferenceOrderedData < ? > rod = new ReferenceOrderedData < ReferenceOrderedDatum > ( trackName . toLowerCase ( ) , new File ( fileName ) , rodClass ) ;
logger . info ( String . format ( "Created binding from %s to %s of type %s" , trackName . toLowerCase ( ) , fileName , rodClass ) ) ;
return rod ;
}
// ----------------------------------------------------------------------
//
// Constructors
//
// ----------------------------------------------------------------------
2009-04-04 00:41:33 +08:00
public ReferenceOrderedData ( final String name , File file , Class < ROD > type ) {
2009-03-16 06:37:20 +08:00
this . file = file ;
this . type = type ;
2009-04-04 00:41:33 +08:00
this . name = name ;
2009-05-15 05:06:28 +08:00
this . header = initializeROD ( name , file , type ) ;
2009-05-15 07:20:11 +08:00
this . fieldDelimiter = newROD ( name , type ) . delimiterRegex ( ) ;
2009-03-16 06:37:20 +08:00
}
2009-04-04 03:54:54 +08:00
/**
 * @return the name this ROD was constructed with
 */
public String getName() {
    return this.name;
}
2009-05-20 07:26:17 +08:00
/ * *
2009-05-24 04:50:28 +08:00
* Special equals override to see if this ROD is compatible with the given
* name and type . ' Compatible ' means that this ROD has the name that ' s passed
* in and its data can fit into the container specified by type .
2009-05-20 07:26:17 +08:00
* @param name Name to check .
* @param type Type to check .
* @return True if these parameters imply this rod . False otherwise .
* /
public boolean matches ( String name , Class < ? extends ReferenceOrderedDatum > type ) {
2009-05-24 04:50:28 +08:00
return this . name . equals ( name ) & & type . isAssignableFrom ( this . type ) ;
2009-05-20 07:26:17 +08:00
}
2009-06-06 07:34:37 +08:00
public RODIterator < ROD > iterator ( ) {
2009-05-21 23:23:22 +08:00
Iterator < ROD > it ;
try {
Method m = type . getDeclaredMethod ( "createIterator" , String . class , java . io . File . class ) ;
it = ( Iterator < ROD > ) m . invoke ( null , name , file ) ;
} catch ( java . lang . NoSuchMethodException e ) {
it = new SimpleRODIterator ( ) ;
} catch ( java . lang . NullPointerException e ) {
throw new RuntimeException ( e ) ;
} catch ( java . lang . SecurityException e ) {
throw new RuntimeException ( e ) ;
} catch ( java . lang . IllegalAccessException e ) {
throw new RuntimeException ( e ) ;
} catch ( java . lang . IllegalArgumentException e ) {
throw new RuntimeException ( e ) ;
} catch ( java . lang . reflect . InvocationTargetException e ) {
throw new RuntimeException ( e ) ;
}
2009-06-06 07:34:37 +08:00
return new RODIterator < ROD > ( it ) ;
2009-05-21 23:23:22 +08:00
}
2009-03-16 06:37:20 +08:00
// ----------------------------------------------------------------------
//
// Testing
//
// ----------------------------------------------------------------------
public void testMe ( ) {
for ( ReferenceOrderedDatum rec : this ) {
2009-04-02 06:54:38 +08:00
System . out . println ( rec . toString ( ) ) ;
rodGFF gff = ( rodGFF ) rec ;
String [ ] keys = { "LENGTH" , "ALT" , "FOBARBAR" } ;
for ( String key : keys ) {
System . out . printf ( " -> %s is (%s)%n" , key , gff . containsAttribute ( key ) ? gff . getAttribute ( key ) : "none" ) ;
2009-03-16 06:37:20 +08:00
}
}
System . exit ( 1 ) ;
}
// ----------------------------------------------------------------------
//
// Manipulations of all of the data
//
// ----------------------------------------------------------------------
/**
 * Reads every record produced by {@link #iterator()} into memory.
 *
 * @return a list, trimmed to size, holding the records in iteration order
 */
public ArrayList<ReferenceOrderedDatum> readAll() {
    final ArrayList<ReferenceOrderedDatum> records = new ArrayList<ReferenceOrderedDatum>();
    final Iterator<ROD> cursor = iterator();
    while (cursor.hasNext()) {
        records.add(cursor.next());
    }
    records.trimToSize();
    return records;
}
/**
 * Sorts the given records in place according to their natural ordering.
 *
 * @param data records to sort; the list itself is reordered
 */
public static void sortRODDataInMemory(ArrayList<ReferenceOrderedDatum> data) {
    Collections.sort(data);
}
/**
 * Writes each record's {@code repl()} text to {@code output}, one record per line
 * (each terminated by "\n").
 *
 * The writer is closed in a {@code finally} block so the file handle is released even
 * when writing a record fails part-way through.
 *
 * @param data   records to write, in list order
 * @param output destination file
 * @throws IOException if the file cannot be opened, written or closed
 */
public static void write(ArrayList<ReferenceOrderedDatum> data, File output) throws IOException {
    final FileWriter out = new FileWriter(output);
    try {
        for (ReferenceOrderedDatum rec : data) {
            out.write(rec.repl() + "\n");
        }
    } finally {
        out.close();
    }
}
public boolean validateFile ( ) throws Exception {
ReferenceOrderedDatum last = null ;
for ( ReferenceOrderedDatum rec : this ) {
if ( last ! = null & & last . compareTo ( rec ) = = 1 ) {
// It's out of order
throw new Exception ( "Out of order elements at \n" + last . toString ( ) + "\n" + rec . toString ( ) ) ;
}
last = rec ;
}
return true ;
}
/**
 * Placeholder for indexing the backing file; currently does nothing.
 */
public void indexFile() {
    // Fixme -- get access to the linear index system from Jim
}
// ----------------------------------------------------------------------
//
// Iteration
//
// ----------------------------------------------------------------------
private class SimpleRODIterator implements Iterator < ROD > {
2009-04-02 06:54:38 +08:00
private xReadLines parser = null ;
2009-03-16 06:37:20 +08:00
public SimpleRODIterator ( ) {
2009-04-02 06:54:38 +08:00
try {
parser = new xReadLines ( file ) ;
} catch ( FileNotFoundException e ) {
Utils . scareUser ( "Couldn't open file: " + file ) ;
}
2009-03-16 06:37:20 +08:00
}
public boolean hasNext ( ) {
2009-04-02 06:54:38 +08:00
//System.out.printf("Parser has next: %b%n", parser.hasNext());
2009-03-16 06:37:20 +08:00
return parser . hasNext ( ) ;
}
public ROD next ( ) {
2009-06-03 02:14:46 +08:00
ROD n = null ;
boolean success = false ;
boolean firstFailure = true ;
do {
final String line = parser . next ( ) ;
2009-06-09 23:39:40 +08:00
//System.out.printf("Line is '%s'%n", line);
2009-06-03 02:14:46 +08:00
String parts [ ] = line . split ( fieldDelimiter ) ;
try {
n = parseLine ( parts ) ;
// Two failure conditions:
// 1) parseLine throws an exception.
// 2) parseLine returns null.
// 3) parseLine throws a RuntimeException.
// TODO: Clean this up so that all errors are handled in one spot.
success = ( n ! = null ) ;
}
catch ( MalformedGenomeLocException ex ) {
if ( firstFailure ) {
2009-06-05 23:49:03 +08:00
Utils . warnUser ( "Failed to parse contig on line '" + line + "'. The reason given was: " + ex . getMessage ( ) + " Skipping ahead to the next recognized GenomeLoc. " ) ;
2009-06-03 02:14:46 +08:00
firstFailure = false ;
}
if ( ! parser . hasNext ( ) )
Utils . warnUser ( "Unable to find more valid reference-ordered data. Giving up." ) ;
}
} while ( ! success & & parser . hasNext ( ) ) ;
return n ;
2009-03-16 06:37:20 +08:00
}
public void remove ( ) {
throw new UnsupportedOperationException ( ) ;
}
}
// ----------------------------------------------------------------------
//
// Parsing
//
// ----------------------------------------------------------------------
2009-07-06 00:28:24 +08:00
private Constructor < ROD > parsing_constructor ;
2009-05-15 05:06:28 +08:00
private ROD newROD ( final String name , final Class < ROD > type ) {
2009-03-16 06:37:20 +08:00
try {
2009-07-06 00:28:24 +08:00
return ( ROD ) parsing_constructor . newInstance ( name ) ;
2009-03-16 06:37:20 +08:00
} catch ( java . lang . InstantiationException e ) {
2009-05-15 05:06:28 +08:00
throw new RuntimeException ( e ) ;
2009-03-16 06:37:20 +08:00
} catch ( java . lang . IllegalAccessException e ) {
2009-05-15 05:06:28 +08:00
throw new RuntimeException ( e ) ;
2009-04-04 00:41:33 +08:00
} catch ( InvocationTargetException e ) {
2009-05-15 05:06:28 +08:00
throw new RuntimeException ( e ) ;
}
}
private Object initializeROD ( final String name , final File file , final Class < ROD > type ) {
2009-07-06 00:28:24 +08:00
try { parsing_constructor = type . getConstructor ( String . class ) ; }
catch ( java . lang . NoSuchMethodException e ) { throw new RuntimeException ( e ) ; }
2009-05-15 05:06:28 +08:00
ROD rod = newROD ( name , type ) ;
try {
return rod . initialize ( file ) ;
} catch ( FileNotFoundException e ) {
throw new RuntimeException ( e ) ;
}
}
private ROD parseLine ( final String [ ] parts ) {
//System.out.printf("Parsing GFFLine %s%n", Utils.join(" ", parts));
ROD obj = newROD ( name , type ) ;
try {
if ( ! obj . parseLine ( header , parts ) )
obj = null ;
} catch ( IOException e ) {
throw new RuntimeException ( "Badly formed ROD: " + e ) ;
2009-04-04 00:41:33 +08:00
}
2009-05-15 05:06:28 +08:00
return obj ;
2009-03-16 06:37:20 +08:00
}
}