2009-03-16 06:37:20 +08:00
package org.broadinstitute.sting.gatk.refdata ;
2009-07-17 05:03:47 +08:00
import org.apache.log4j.Logger ;
2009-08-12 06:10:20 +08:00
import org.broadinstitute.sting.utils.StingException ;
2009-07-17 05:03:47 +08:00
import org.broadinstitute.sting.utils.Utils ;
2009-08-12 06:10:20 +08:00
import java.io.* ;
2009-05-21 23:23:22 +08:00
import java.lang.reflect.Method ;
2009-07-17 05:03:47 +08:00
import java.util.* ;
2009-03-16 06:37:20 +08:00
/**
 * Class for representing arbitrary reference ordered data sets
 * <p/>
 * User: mdepristo
 * Date: Feb 27, 2009
 * Time: 10:47:14 AM
 * To change this template use File | Settings | File Templates.
 */
2009-09-22 00:55:22 +08:00
public class ReferenceOrderedData < ROD extends ReferenceOrderedDatum > implements Iterable < RODRecordList < ROD > > {
2009-04-04 00:41:33 +08:00
private String name ;
2009-03-16 06:37:20 +08:00
private File file = null ;
2009-09-22 00:55:22 +08:00
// private String fieldDelimiter;
2009-08-12 06:10:20 +08:00
/** Header object returned from the datum */
2009-09-22 00:55:22 +08:00
// private Object header = null;
2009-08-12 06:10:20 +08:00
2009-03-16 06:37:20 +08:00
private Class < ROD > type = null ; // runtime type information for object construction
2009-08-12 06:10:20 +08:00
/** our log, which we want to capture anything from this class */
private static Logger logger = Logger . getLogger ( ReferenceOrderedData . class ) ;
2009-04-10 06:04:59 +08:00
// ----------------------------------------------------------------------
//
// Static ROD type management
//
// ----------------------------------------------------------------------
public static class RODBinding {
public final String name ;
public final Class < ? extends ReferenceOrderedDatum > type ;
2009-08-12 06:10:20 +08:00
2009-04-10 06:04:59 +08:00
public RODBinding ( final String name , final Class < ? extends ReferenceOrderedDatum > type ) {
this . name = name ;
this . type = type ;
}
}
public static HashMap < String , RODBinding > Types = new HashMap < String , RODBinding > ( ) ;
2009-08-12 06:10:20 +08:00
2009-04-10 06:04:59 +08:00
public static void addModule ( final String name , final Class < ? extends ReferenceOrderedDatum > rodType ) {
2009-07-17 05:03:47 +08:00
final String boundName = name . toLowerCase ( ) ;
2009-08-12 06:10:20 +08:00
if ( Types . containsKey ( boundName ) ) {
2009-07-17 05:03:47 +08:00
throw new RuntimeException ( String . format ( "GATK BUG: adding ROD module %s that is already bound" , boundName ) ) ;
}
2009-09-05 03:13:37 +08:00
logger . info ( String . format ( "* Adding rod class %s" , name ) ) ;
2009-07-17 05:03:47 +08:00
Types . put ( boundName , new RODBinding ( name , rodType ) ) ;
2009-04-10 06:04:59 +08:00
}
// Registers every known ROD type when the class is loaded. addModule() throws if a
// (case-insensitive) name is registered twice, so each name below must be unique.
static {
    addModule("GFF",                     RodGenotypeChipAsGFF.class);
    addModule("dbSNP",                   rodDbSNP.class);
    addModule("HapMapAlleleFrequencies", HapMapAlleleFrequenciesROD.class);
    addModule("SAMPileup",               rodSAMPileup.class);
    addModule("GELI",                    rodGELI.class);
    addModule("RefSeq",                  rodRefSeq.class);
    addModule("Table",                   TabularROD.class);
    addModule("PooledEM",                PooledEMSNPROD.class);
    addModule("CleanedOutSNP",           CleanedOutSNPROD.class);
    addModule("Sequenom",                SequenomROD.class);
    addModule("SangerSNP",               SangerSNPROD.class);
    addModule("SimpleIndel",             SimpleIndelROD.class);
    addModule("PointIndel",              PointIndelROD.class);
    addModule("HapMapGenotype",          HapMapGenotypeROD.class);
    addModule("Intervals",               IntervalRod.class);
    addModule("Variants",                RodGeliText.class);
    addModule("GLF",                     RodGLF.class);
    addModule("VCF",                     RodVCF.class);
}
/ * *
* Parse the ROD bindings . These are of the form of a single list of strings , each triplet of the
2009-05-08 23:28:19 +08:00
* form < name > , < type > , < file > . After this function , the List of RODs contains new RODs bound to each of
2009-04-10 06:04:59 +08:00
* name , of type , ready to read from the file . This function does check for the strings to be well formed
* and such .
*
* @param bindings
* @param rods
* /
2009-08-12 06:10:20 +08:00
public static void parseBindings ( ArrayList < String > bindings , List < ReferenceOrderedData < ? extends ReferenceOrderedDatum > > rods ) {
// pre-process out any files that were passed in as rod binding command line options
for ( int x = 0 ; x < bindings . size ( ) ; x + + ) {
if ( new File ( bindings . get ( x ) ) . exists ( ) ) {
extractRodsFromFile ( bindings , bindings . get ( x ) ) ;
bindings . remove ( x ) ;
x - - ;
}
}
2009-04-10 06:04:59 +08:00
// Loop over triplets
2009-08-12 06:10:20 +08:00
for ( String bindingSets : bindings ) {
2009-05-28 06:02:24 +08:00
String [ ] bindingTokens = bindingSets . split ( "," ) ;
2009-08-12 06:10:20 +08:00
if ( bindingTokens . length % 3 ! = 0 )
2009-05-08 23:28:19 +08:00
Utils . scareUser ( String . format ( "Invalid ROD specification: requires triplets of <name>,<type>,<file> but got %s" , Utils . join ( "," , bindings ) ) ) ;
2009-08-12 06:10:20 +08:00
for ( int bindingSet = 0 ; bindingSet < bindingTokens . length ; bindingSet + = 3 ) {
2009-05-28 06:02:24 +08:00
logger . info ( "Processing ROD bindings: " + bindings . size ( ) + " -> " + Utils . join ( " : " , bindingTokens ) ) ;
2009-04-10 06:04:59 +08:00
2009-05-28 06:02:24 +08:00
final String name = bindingTokens [ bindingSet ] ;
final String typeName = bindingTokens [ bindingSet + 1 ] ;
final String fileName = bindingTokens [ bindingSet + 2 ] ;
2009-04-10 06:04:59 +08:00
2009-08-12 06:10:20 +08:00
ReferenceOrderedData < ? > rod = parse1Binding ( name , typeName , fileName ) ;
2009-04-10 06:04:59 +08:00
2009-05-28 06:02:24 +08:00
// check that we're not generating duplicate bindings
2009-08-12 06:10:20 +08:00
for ( ReferenceOrderedData rod2 : rods )
if ( rod2 . getName ( ) . equals ( rod . getName ( ) ) )
2009-05-28 06:02:24 +08:00
Utils . scareUser ( String . format ( "Found duplicate rod bindings" , rod . getName ( ) ) ) ;
rods . add ( rod ) ;
}
2009-04-10 06:04:59 +08:00
}
}
2009-08-12 06:10:20 +08:00
/ * *
* given a existing file , open it and append all the valid triplet lines to an existing list
*
* @param rodTripletList the list of existing triplets
* @param filename the file to attempt to extract ROD triplets from
* /
protected static void extractRodsFromFile ( List < String > rodTripletList , String filename ) {
BufferedReader str ;
try {
str = new BufferedReader ( new FileReader ( new File ( filename ) ) ) ;
} catch ( FileNotFoundException e ) {
throw new StingException ( "Unable to load the ROD input file " + filename , e ) ;
}
String line = "NO LINES READ IN" ;
try {
while ( ( line = str . readLine ( ) ) ! = null ) {
if ( line . matches ( ".+,.+,.+" ) ) rodTripletList . add ( line . trim ( ) ) ;
else logger . warn ( "the following file line didn't parsing into a triplet -> " + line ) ;
}
} catch ( IOException e ) {
throw new StingException ( "Failed reading the input rod file " + filename + " last line read was " + line , e ) ;
}
}
2009-04-10 06:04:59 +08:00
/ * *
* Helpful function that parses a single triplet of < name > < type > < file > and returns the corresponding ROD with
* < name > , of type < type > that reads its input from < file > .
2009-08-12 06:10:20 +08:00
*
2009-04-10 06:04:59 +08:00
* @param trackName
* @param typeName
* @param fileName
* @return
* /
2009-08-12 06:10:20 +08:00
private static ReferenceOrderedData < ? > parse1Binding ( final String trackName , final String typeName , final String fileName ) {
2009-04-10 06:04:59 +08:00
// Gracefully fail if we don't have the type
2009-08-12 06:10:20 +08:00
if ( ReferenceOrderedData . Types . get ( typeName . toLowerCase ( ) ) = = null )
2009-04-10 06:04:59 +08:00
Utils . scareUser ( String . format ( "Unknown ROD type: %s" , typeName ) ) ;
// Lookup the type
Class rodClass = ReferenceOrderedData . Types . get ( typeName . toLowerCase ( ) ) . type ;
// Create the ROD
ReferenceOrderedData < ? > rod = new ReferenceOrderedData < ReferenceOrderedDatum > ( trackName . toLowerCase ( ) , new File ( fileName ) , rodClass ) ;
logger . info ( String . format ( "Created binding from %s to %s of type %s" , trackName . toLowerCase ( ) , fileName , rodClass ) ) ;
return rod ;
}
// ----------------------------------------------------------------------
//
// Constructors
//
// ----------------------------------------------------------------------
2009-04-04 00:41:33 +08:00
/**
 * Creates a ROD that only records its binding; nothing is opened or parsed here.
 *
 * @param name name this data set is bound to
 * @param file file the records will be read from
 * @param type datum class used to interpret the records
 */
public ReferenceOrderedData(final String name, File file, Class<ROD> type) {
    this.name = name;
    this.file = file;
    this.type = type;
    // Eager header/delimiter initialization is currently disabled:
    // this.header = initializeROD(name, file, type);
    // this.fieldDelimiter = newROD(name, type).delimiterRegex();
}
2009-04-04 03:54:54 +08:00
/** @return the name this data set was constructed with */
public String getName() {
    return this.name;
}
2009-05-20 07:26:17 +08:00
/ * *
2009-05-24 04:50:28 +08:00
* Special equals override to see if this ROD is compatible with the given
* name and type . ' Compatible ' means that this ROD has the name that ' s passed
* in and its data can fit into the container specified by type .
2009-08-12 06:10:20 +08:00
*
2009-05-20 07:26:17 +08:00
* @param name Name to check .
* @param type Type to check .
2009-08-12 06:10:20 +08:00
*
2009-05-20 07:26:17 +08:00
* @return True if these parameters imply this rod . False otherwise .
* /
2009-08-12 06:10:20 +08:00
public boolean matches ( String name , Class < ? extends ReferenceOrderedDatum > type ) {
2009-05-24 04:50:28 +08:00
return this . name . equals ( name ) & & type . isAssignableFrom ( this . type ) ;
2009-05-20 07:26:17 +08:00
}
2009-09-22 00:55:22 +08:00
public SeekableRODIterator < ROD > iterator ( ) {
2009-08-12 06:10:20 +08:00
Iterator < ROD > it ;
2009-05-21 23:23:22 +08:00
try {
2009-08-12 06:10:20 +08:00
Method m = type . getDeclaredMethod ( "createIterator" , String . class , java . io . File . class ) ;
2009-05-21 23:23:22 +08:00
it = ( Iterator < ROD > ) m . invoke ( null , name , file ) ;
2009-08-12 06:10:20 +08:00
} catch ( java . lang . NoSuchMethodException e ) {
2009-09-22 00:55:22 +08:00
it = new RODRecordIterator ( file , name , type ) ;
2009-08-12 06:10:20 +08:00
} catch ( java . lang . NullPointerException e ) {
2009-05-21 23:23:22 +08:00
throw new RuntimeException ( e ) ;
2009-08-12 06:10:20 +08:00
} catch ( java . lang . SecurityException e ) {
2009-05-21 23:23:22 +08:00
throw new RuntimeException ( e ) ;
2009-08-12 06:10:20 +08:00
} catch ( java . lang . IllegalAccessException e ) {
throw new RuntimeException ( e ) ;
} catch ( java . lang . IllegalArgumentException e ) {
throw new RuntimeException ( e ) ;
} catch ( java . lang . reflect . InvocationTargetException e ) {
throw new RuntimeException ( e ) ;
}
2009-09-22 00:55:22 +08:00
// return new RODIterator<ROD>(it);
return new SeekableRODIterator ( it ) ;
2009-08-12 06:10:20 +08:00
}
2009-03-16 06:37:20 +08:00
// ----------------------------------------------------------------------
//
// Testing
//
// ----------------------------------------------------------------------
public void testMe ( ) {
2009-09-22 00:55:22 +08:00
for ( RODRecordList < ROD > rec : this ) {
System . out . println ( rec . getRecords ( ) . get ( 0 ) . toString ( ) ) ;
2009-04-02 06:54:38 +08:00
2009-09-22 00:55:22 +08:00
RodGenotypeChipAsGFF gff = ( RodGenotypeChipAsGFF ) rec . getRecords ( ) . get ( 0 ) ;
2009-04-02 06:54:38 +08:00
String [ ] keys = { "LENGTH" , "ALT" , "FOBARBAR" } ;
2009-08-12 06:10:20 +08:00
for ( String key : keys ) {
2009-04-02 06:54:38 +08:00
System . out . printf ( " -> %s is (%s)%n" , key , gff . containsAttribute ( key ) ? gff . getAttribute ( key ) : "none" ) ;
2009-03-16 06:37:20 +08:00
}
}
System . exit ( 1 ) ;
}
// ----------------------------------------------------------------------
//
// Manipulations of all of the data
//
// ----------------------------------------------------------------------
public ArrayList < ReferenceOrderedDatum > readAll ( ) {
ArrayList < ReferenceOrderedDatum > elts = new ArrayList < ReferenceOrderedDatum > ( ) ;
2009-09-22 00:55:22 +08:00
for ( RODRecordList < ROD > l : this ) {
for ( ReferenceOrderedDatum rec : l ) {
elts . add ( rec ) ;
}
2009-03-16 06:37:20 +08:00
}
elts . trimToSize ( ) ;
return elts ;
}
/**
 * Sorts the given records in place using {@link Collections#sort(java.util.List)}.
 *
 * @param data the records to sort; reordered in place
 */
public static void sortRODDataInMemory(ArrayList<ReferenceOrderedDatum> data) {
    Collections.sort(data);
}
public static void write ( ArrayList < ReferenceOrderedDatum > data , File output ) throws IOException {
final FileWriter out = new FileWriter ( output ) ;
2009-08-12 06:10:20 +08:00
for ( ReferenceOrderedDatum rec : data ) {
2009-03-16 06:37:20 +08:00
out . write ( rec . repl ( ) + "\n" ) ;
}
out . close ( ) ;
}
public boolean validateFile ( ) throws Exception {
ReferenceOrderedDatum last = null ;
2009-09-22 00:55:22 +08:00
for ( RODRecordList < ROD > l : this ) {
for ( ReferenceOrderedDatum rec : l ) {
if ( last ! = null & & last . compareTo ( rec ) > 1 ) {
// It's out of order
throw new Exception ( "Out of order elements at \n" + last . toString ( ) + "\n" + rec . toString ( ) ) ;
}
last = rec ;
2009-08-12 06:10:20 +08:00
}
2009-03-16 06:37:20 +08:00
}
return true ;
}
/**
 * Placeholder for building an index over the file; currently does nothing.
 */
public void indexFile() {
    // Fixme -- get access to the linear index system from Jim
}
// ----------------------------------------------------------------------
//
// Iteration
//
// ----------------------------------------------------------------------
2009-09-22 00:55:22 +08:00
// private class SimpleRODIterator implements Iterator<ROD> {
// private xReadLines parser = null;
//
// public SimpleRODIterator() {
// try {
// parser = new xReadLines(file);
// } catch (FileNotFoundException e) {
// Utils.scareUser("Couldn't open file: " + file);
// }
// }
//
// public boolean hasNext() {
// //System.out.printf("Parser has next: %b%n", parser.hasNext());
// return parser.hasNext();
// }
//
// public ROD next() {
// ROD n = null;
// boolean success = false;
// boolean firstFailure = true;
//
// do {
// final String line = parser.next();
// //System.out.printf("Line is '%s'%n", line);
// String parts[] = line.split(fieldDelimiter);
//
// try {
// n = parseLine(parts);
// // Three failure conditions:
// // 1) parseLine throws an exception.
// // 2) parseLine returns null.
// // 3) parseLine throws a RuntimeException.
// // TODO: Clean this up so that all errors are handled in one spot.
// success = (n != null);
// }
// catch (MalformedGenomeLocException ex) {
// if (firstFailure) {
// Utils.warnUser("Failed to parse contig on line '" + line + "'. The reason given was: " + ex.getMessage() + " Skipping ahead to the next recognized GenomeLoc. ");
// firstFailure = false;
// }
// if (!parser.hasNext())
// Utils.warnUser("Unable to find more valid reference-ordered data. Giving up.");
// }
//
// } while (!success && parser.hasNext());
//
// return n;
// }
//
// public void remove() {
// throw new UnsupportedOperationException();
// }
// }
2009-03-16 06:37:20 +08:00
// ----------------------------------------------------------------------
//
// Parsing
//
// ----------------------------------------------------------------------
2009-09-22 00:55:22 +08:00
// private Constructor<ROD> parsing_constructor;
// private ROD newROD(final String name, final Class<ROD> type) {
// try {
// return (ROD) parsing_constructor.newInstance(name);
// } catch (java.lang.InstantiationException e) {
// throw new RuntimeException(e);
// } catch (java.lang.IllegalAccessException e) {
// throw new RuntimeException(e);
// } catch (InvocationTargetException e) {
// throw new RuntimeException(e);
// }
// }
// private Object initializeROD(final String name, final File file, final Class<ROD> type) {
// try {
// parsing_constructor = type.getConstructor(String.class);
// }
// catch (java.lang.NoSuchMethodException e) {
// throw new RuntimeException(e);
// }
// ROD rod = newROD(name, type);
// try {
// return rod.initialize(file);
// } catch (FileNotFoundException e) {
// throw new RuntimeException(e);
// }
// }
// private ROD parseLine(final String[] parts) {
// //System.out.printf("Parsing GFFLine %s%n", Utils.join(" ", parts));
// ROD obj = newROD(name, type);
// try {
// if (!obj.parseLine(header, parts))
// obj = null;
// } catch (IOException e) {
// throw new RuntimeException("Badly formed ROD: " + e);
// }
// return obj;
// }
2009-03-16 06:37:20 +08:00
}