Phase II support for generic reference order data set parsing.

Can read GFF and dbSNP files correctly.  
Traversal engine now supports keeping moving window of reference ordered data along with the locus iterator.
Tested by walking through a sam file keeping track of the dbSNP positions encountered -- they definitely look right...


git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@13 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
depristo 2009-02-28 20:47:48 +00:00
parent dee840efa5
commit f64f3e2d90
10 changed files with 465 additions and 146 deletions

View File

@ -7,9 +7,13 @@ import edu.mit.broad.picard.cmdline.Option;
import edu.mit.broad.sting.atk.modules.*;
import edu.mit.broad.sting.utils.ReferenceOrderedData;
import edu.mit.broad.sting.utils.rodGFF;
import edu.mit.broad.sting.utils.rodDbSNP;
import java.io.*;
import java.util.HashMap;
import java.util.List;
import java.util.ArrayList;
public class AnalysisTK extends CommandLineProgram {
// Usage and parameters
@ -44,10 +48,23 @@ public class AnalysisTK extends CommandLineProgram {
}
protected int doWork() {
ReferenceOrderedData rod = new ReferenceOrderedData(new File("trunk/data/gFFTest.gff"));
rod.testMe();
final boolean TEST_ROD = false;
ReferenceOrderedData[] rods = null;
this.engine = new TraversalEngine(INPUT_FILE, REF_FILE_ARG);
if ( TEST_ROD ) {
ReferenceOrderedData gff = new ReferenceOrderedData(new File("trunk/data/gFFTest.gff"), rodGFF.class );
gff.testMe();
//ReferenceOrderedData dbsnp = new ReferenceOrderedData(new File("trunk/data/dbSNP_head.txt"), rodDbSNP.class );
ReferenceOrderedData dbsnp = new ReferenceOrderedData(new File("/Volumes/Users/mdepristo/broad/ATK/exampleSAMs/dbSNP_chr20.txt"), rodDbSNP.class );
//dbsnp.testMe();
rods = new ReferenceOrderedData[] { dbsnp }; // { gff, dbsnp };
}
else {
rods = new ReferenceOrderedData[] {}; // { gff, dbsnp };
}
this.engine = new TraversalEngine(INPUT_FILE, REF_FILE_ARG, rods);
ValidationStringency strictness;
if ( STRICTNESS_ARG == null ) {

View File

@ -1,6 +1,9 @@
package edu.mit.broad.sting.atk;
import edu.mit.broad.sting.atk.LocusIterator;
import edu.mit.broad.sting.utils.ReferenceOrderedDatum;
import java.util.List;
/**
* Created by IntelliJ IDEA.
@ -14,10 +17,10 @@ public interface LocusWalker<MapType, ReduceType> {
public String walkerType();
// Do we actually want to operate on the context?
boolean filter(char ref, LocusIterator context);
boolean filter(List<ReferenceOrderedDatum> rodData, char ref, LocusIterator context);
// Map over the edu.mit.broad.sting.atk.LocusContext
MapType map(char ref, LocusIterator context);
MapType map(List<ReferenceOrderedDatum> rodData, char ref, LocusIterator context);
// Given result of map function
ReduceType reduceInit();

View File

@ -9,13 +9,21 @@ import edu.mit.broad.picard.filter.FilteringIterator;
import edu.mit.broad.picard.reference.ReferenceSequenceFile;
import edu.mit.broad.picard.reference.ReferenceSequenceFileFactory;
import edu.mit.broad.sting.utils.ReferenceIterator;
import edu.mit.broad.sting.utils.ReferenceOrderedData;
import edu.mit.broad.sting.utils.ReferenceOrderedDatum;
import java.io.*;
import java.util.List;
import java.util.Iterator;
import java.util.ArrayList;
import java.util.Arrays;
public class TraversalEngine {
// Usage and parameters
private File readsFile = null;
private File refFileName = null;
private List<ReferenceOrderedData> rods = null;
private String regionStr = null;
private String traversalType = null;
private ValidationStringency strictness = ValidationStringency.STRICT;
@ -42,9 +50,10 @@ public class TraversalEngine {
// Setting up the engine
//
// --------------------------------------------------------------------------------------------------------------
public TraversalEngine(File reads, File ref) {
public TraversalEngine(File reads, File ref, ReferenceOrderedData[] rods ) {
readsFile = reads;
refFileName = ref;
this.rods = Arrays.asList(rods);
}
public void setRegion(final String reg) { regionStr = regionStr; }
@ -100,6 +109,30 @@ public class TraversalEngine {
System.exit(1);
}
// --------------------------------------------------------------------------------------------------------------
//
// dealing with reference ordered data
//
// --------------------------------------------------------------------------------------------------------------
protected List<ReferenceOrderedData.RODIterator> initializeRODs() {
// set up reference ordered data
List<ReferenceOrderedData.RODIterator> rodIters = new ArrayList<ReferenceOrderedData.RODIterator>();
for ( ReferenceOrderedData data : rods ) {
rodIters.add(data.iterator());
}
return rodIters;
}
protected List<ReferenceOrderedDatum> getReferenceOrderedDataAtLocus(List<ReferenceOrderedData.RODIterator> rodIters,
final String contig, final int pos) {
List<ReferenceOrderedDatum> data = new ArrayList<ReferenceOrderedDatum>();
for ( ReferenceOrderedData.RODIterator iter : rodIters ) {
data.add(iter.seekForward(contig, pos));
}
return data;
}
// --------------------------------------------------------------------------------------------------------------
//
// traversal functions
@ -159,6 +192,8 @@ public class TraversalEngine {
FilteringIterator filterIter = new FilteringIterator(readStream.iterator(), new locusStreamFilterFunc());
CloseableIterator<LocusIterator> iter = new LocusIterator(filterIter);
List<ReferenceOrderedData.RODIterator> rodIters = initializeRODs();
T sum = walker.reduceInit();
while ( iter.hasNext() ) {
this.nRecords++;
@ -167,13 +202,14 @@ public class TraversalEngine {
final LocusIterator locus = iter.next();
final ReferenceIterator refSite = refIter.seekForward(locus.getContig(), locus.getPosition());
final char refBase = refSite.getBaseAsChar();
final List<ReferenceOrderedDatum> rodData = getReferenceOrderedDataAtLocus(rodIters, locus.getContig(), locus.getPosition());
if ( DEBUGGING )
System.out.printf(" Reference: %s:%d %c%n", refSite.getCurrentContig().getName(), refSite.getPosition(), refBase);
final boolean keepMeP = walker.filter(refBase, locus);
final boolean keepMeP = walker.filter(rodData, refBase, locus);
if ( keepMeP ) {
M x = walker.map(refBase, locus);
M x = walker.map(rodData, refBase, locus);
sum = walker.reduce(x, sum);
}

View File

@ -2,8 +2,11 @@ package edu.mit.broad.sting.atk.modules;
import edu.mit.broad.sting.atk.LocusWalker;
import edu.mit.broad.sting.atk.LocusIterator;
import edu.mit.broad.sting.utils.ReferenceOrderedDatum;
import edu.mit.broad.sam.SAMRecord;
import java.util.List;
/**
* Created by IntelliJ IDEA.
* User: mdepristo
@ -18,12 +21,12 @@ public class EmptyLocusWalker implements LocusWalker<Integer, Integer> {
public String walkerType() { return "ByLocus"; }
// Do we actually want to operate on the context?
public boolean filter(char ref, LocusIterator context) {
public boolean filter(List<ReferenceOrderedDatum> rodData, char ref, LocusIterator context) {
return true; // We are keeping all the reads
}
// Map over the edu.mit.broad.sting.atk.LocusContext
public Integer map(char ref, LocusIterator context) {
public Integer map(List<ReferenceOrderedDatum> rodData, char ref, LocusIterator context) {
return 1;
}

View File

@ -2,6 +2,7 @@ package edu.mit.broad.sting.atk.modules;
import edu.mit.broad.sting.atk.LocusWalker;
import edu.mit.broad.sting.atk.LocusIterator;
import edu.mit.broad.sting.utils.ReferenceOrderedDatum;
import edu.mit.broad.sam.SAMRecord;
import java.util.List;
@ -20,12 +21,12 @@ public class PileupWalker implements LocusWalker<Integer, Integer> {
public String walkerType() { return "ByLocus"; }
// Do we actually want to operate on the context?
public boolean filter(char ref, LocusIterator context) {
public boolean filter(List<ReferenceOrderedDatum> rodData, char ref, LocusIterator context) {
return true; // We are keeping all the reads
}
// Map over the edu.mit.broad.sting.atk.LocusContext
public Integer map(char ref, LocusIterator context) {
public Integer map(List<ReferenceOrderedDatum> rodData, char ref, LocusIterator context) {
//System.out.printf("Reads %s:%d %d%n", context.getContig(), context.getPosition(), context.getReads().size());
//for ( SAMRecord read : context.getReads() ) {
// System.out.println(" -> " + read.getReadName());
@ -49,8 +50,18 @@ public class PileupWalker implements LocusWalker<Integer, Integer> {
//System.out.printf(" [%2d] [%s] %s%n", offset, read.getReadString().charAt(offset), read.getReadString());
}
if ( context.getPosition() % 10 == 0 )
System.out.printf("%s:%d: %s %s %s%n", context.getContig(), context.getPosition(), ref, bases, quals);
String rodString = "";
for ( ReferenceOrderedDatum datum : rodData ) {
if ( datum != null ) {
rodString += datum.toSimpleString();
}
}
if ( rodString != "" )
rodString = "[ROD: " + rodString + "]";
if ( context.getPosition() % 1 == 0 ) {
System.out.printf("%s:%d: %s %s %s %s%n", context.getContig(), context.getPosition(), ref, bases, quals, rodString);
}
//for ( int offset : context.getOffsets() ) {
// System.out.println(" -> " + read.getReadName());

View File

@ -1,15 +1,7 @@
package edu.mit.broad.sting.utils;
import edu.mit.broad.sam.SAMRecord;
import edu.mit.broad.sam.util.CloseableIterator;
import edu.mit.broad.picard.util.TabbedTextFileParser;
import java.io.File;
import java.io.InputStream;
import java.io.FileInputStream;
import java.io.BufferedInputStream;
import java.util.Iterator;
import java.util.HashMap;
/**
* Class for representing arbitrary reference ordered data sets
@ -19,40 +11,17 @@ import java.util.HashMap;
* Time: 10:47:14 AM
* To change this template use File | Settings | File Templates.
*/
public class ReferenceOrderedData implements Iterable<ReferenceOrderedDatum> {
public class ReferenceOrderedData<ROD extends ReferenceOrderedDatum> implements Iterable<ROD> {
private File file = null;
private Class<ROD> type = null; // runtime type information for object construction
public ReferenceOrderedData(File file) {
public ReferenceOrderedData(File file, Class<ROD> type ) {
this.file = file;
}
// ----------------------------------------------------------------------
//
// Iteration
//
// ----------------------------------------------------------------------
private class RODIterator implements Iterator<ReferenceOrderedDatum> {
TabbedTextFileParser parser = null;
public RODIterator() {
parser = new TabbedTextFileParser(true, file);
}
public boolean hasNext() {
return parser.hasNext();
}
public ReferenceOrderedDatum next() {
String parts[] = parser.next();
return parseGFFLine(parts);
}
public void remove () {
throw new UnsupportedOperationException();
}
this.type = type;
}
public RODIterator iterator() {
return new RODIterator();
return new RODIterator(new SimpleRODIterator());
}
// ----------------------------------------------------------------------
@ -66,28 +35,112 @@ public class ReferenceOrderedData implements Iterable<ReferenceOrderedDatum> {
}
}
// ----------------------------------------------------------------------
//
// Iteration
//
// ----------------------------------------------------------------------
private class SimpleRODIterator implements Iterator<ROD> {
private WhitespaceTextFileParser parser = null;
public SimpleRODIterator() {
parser = new WhitespaceTextFileParser(true, file);
}
public boolean hasNext() {
return parser.hasNext();
}
public ROD next() {
String parts[] = parser.next();
return parseLine(parts);
}
public void remove() {
throw new UnsupportedOperationException();
}
}
public class RODIterator implements Iterator<ROD> {
private PushbackIterator<ROD> it;
private ROD prev = null;
public RODIterator(SimpleRODIterator it) {
this.it = new PushbackIterator<ROD>(it);
}
public boolean hasNext() { return it.hasNext(); }
public ROD next() {
prev = it.next();
return prev;
}
//
// Seeks forward in the file until we reach (or cross) a record at contig / pos
// If we don't find anything and cross beyond contig / pos, we return null
// Otherwise we return the first object who's start is at pos
//
public ROD seekForward(final String contigName, final int pos) {
ROD result = null;
//System.out.printf(" *** starting seek to %s %d %s%n", contigName, pos, prev);
while ( hasNext() ) {
ROD current = next();
//System.out.printf(" -> Seeking to %s %d AT %s %d%n", contigName, pos, current.getContig(), current.getStart());
int strCmp = contigName.compareTo( prev.getContig() );
if ( strCmp == 0 ) {
// The contigs are equal
if ( current.getStart() > pos ) {
// There was nothing to find, push back next and return null
it.pushback(current);
break;
}
else if ( pos == current.getStart() ) {
// We found a record at contig / pos, return it
result = current;
break;
}
}
else if ( strCmp < 0 ) {
// We've gone past the desired contig, break
break;
}
}
/*
if ( result == null )
;
//System.out.printf(" --- seek result to %s %d is NULL%n", contigName, pos);
else
System.out.printf(" ### Found %s %d%n", result.getContig(), result.getStart());
*/
// we ran out of elements or found something
return result;
}
public void remove() {
throw new UnsupportedOperationException();
}
}
// ----------------------------------------------------------------------
//
// Parsing
//
// ----------------------------------------------------------------------
ReferenceOrderedDatum parseGFFLine(final String[] parts) {
ROD parseLine(final String[] parts) {
//System.out.printf("Parsing GFFLine %s%n", Utils.join(" ", parts));
final String contig = parts[0];
final String source = parts[1];
final String feature = parts[2];
final long start = Long.parseLong(parts[3]);
final long stop = Long.parseLong(parts[4]);
double score = Double.NaN;
if ( ! parts[5].equals(".") )
score = Double.parseDouble(parts[5]);
final String strand = parts[6];
final String frame = parts[7];
HashMap<String, String> attributes = null;
return new ReferenceOrderedDatum(contig, source, feature, start, stop, score, strand, frame, attributes);
try {
ROD obj = type.newInstance();
obj.parseLine(parts);
return obj;
} catch ( java.lang.InstantiationException e ) {
System.out.println(e);
return null; // wow, unsafe!
} catch ( java.lang.IllegalAccessException e ) {
System.out.println(e);
return null; // wow, unsafe!
}
}
}

View File

@ -1,7 +1,5 @@
package edu.mit.broad.sting.utils;
import java.util.HashMap;
/**
* Created by IntelliJ IDEA.
* User: mdepristo
@ -9,85 +7,15 @@ import java.util.HashMap;
* Time: 10:49:47 AM
* To change this template use File | Settings | File Templates.
*/
public class ReferenceOrderedDatum {
private String contig, source, feature, strand, frame;
private long start, stop;
private double score;
private HashMap<String, String> attributes;
public abstract class ReferenceOrderedDatum {
public ReferenceOrderedDatum() { }
// ----------------------------------------------------------------------
//
// Constructors
//
// ----------------------------------------------------------------------
public ReferenceOrderedDatum(final String contig, final String source, final String feature,
final long start, final long stop, final double score,
final String strand, final String frame, HashMap<String, String> attributes) {
this.contig = contig;
this.source = source;
this.feature = feature;
this.start = start;
this.stop= stop;
this.score = score;
this.strand = strand;
this.frame = frame;
this.attributes = attributes;
}
public abstract void parseLine(final String[] parts);
// public ReferenceOrderedDatum(final String contig, final String source, final String feature,
// final long start, final long stop, final double score,
// final String strand, final String frame) {
// ReferenceOrderedDatum(contig, source, feature, start, stop, score, strand, frame, null);
// }
// ----------------------------------------------------------------------
//
// Accessors
//
// ----------------------------------------------------------------------
public String getContig() {
return this.contig;
}
public String getSource() {
return source;
}
public String getFeature() {
return feature;
}
public String getStrand() {
return strand;
}
public String getFrame() {
return frame;
}
public long getStart() {
return start;
}
public long getStop() {
return stop;
}
public double getScore() {
return score;
}
public String getAttribute(final String key) {
return attributes.get(key);
}
// ----------------------------------------------------------------------
//
// formatting
//
// ----------------------------------------------------------------------
public String toString() {
return String.format("%s\t%s\t%s\t%d\t%d\t%f\t%s\t%s", contig, source, feature, start, stop, score, strand, frame);
}
public abstract String toString();
public abstract String toSimpleString();
public abstract String getContig();
public abstract long getStart();
public abstract long getStop();
}

View File

@ -0,0 +1,43 @@
/*
* The Broad Institute
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
* This software and its documentation are copyright 2009 by the
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
*
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
*/
package edu.mit.broad.sting.utils;
import edu.mit.broad.picard.util.BasicTextFileParser;
import java.io.File;
import java.util.List;
import java.util.Arrays;
/**
* Parser for tab-delimited files
*
* @author Kathleen Tibbetts
*/
public class WhitespaceTextFileParser extends BasicTextFileParser {
/**
* Constructor
*
* @param file The file to parse
*/
public WhitespaceTextFileParser(boolean treatGroupedDelimitersAsOne, File... file) {
super(treatGroupedDelimitersAsOne, file);
}
/**
* Determines whether a given character is a delimiter
*
* @param b the character to evaluate
* @return true if <code>b</code> is a delimiter; otherwise false
*/
protected boolean isDelimiter(byte b) {
return Character.isWhitespace((char)b);
}
}

View File

@ -0,0 +1,102 @@
package edu.mit.broad.sting.utils;
import edu.mit.broad.sam.SAMRecord;
import edu.mit.broad.sam.util.CloseableIterator;
import edu.mit.broad.picard.util.TabbedTextFileParser;
import java.io.File;
import java.io.InputStream;
import java.io.FileInputStream;
import java.io.BufferedInputStream;
import java.util.Iterator;
import java.util.HashMap;
/**
* Example format:
* 585 chr1 433 433 rs56289060 0 + - - -/C genomic insertion unknown 0 0 unknown between 1
* 585 chr1 491 492 rs55998931 0 + C C C/T genomic single unknown 0 0 unknown exact 1
*
* User: mdepristo
* Date: Feb 27, 2009
* Time: 10:47:14 AM
* To change this template use File | Settings | File Templates.
*/
public class rodDbSNP extends ReferenceOrderedDatum {
public String contig; // Reference sequence chromosome or scaffold
public long start, stop; // Start and stop positions in chrom
public String name; // Reference SNP identifier or Affy SNP name
public String strand; // Which DNA strand contains the observed alleles
public String observed; // The sequences of the observed alleles from rs-fasta files
public char[] observedBases; // The sequences of the observed alleles from rs-fasta files
public String molType; // Sample type from exemplar ss
public String varType; // The class of variant (simple, insertion, deletion, range, etc.)
// Can be 'unknown','single','in-del','het','microsatellite','named','mixed','mnp','insertion','deletion'
public String validationStatus; // The validation status of the SNP
// one of set('unknown','by-cluster','by-frequency','by-submitter','by-2hit-2allele','by-hapmap')
public double avHet; // The average heterozygosity from all observations
public double avHetSE; // The Standard Error for the average heterozygosity
public String func; // The functional category of the SNP (coding-synon, coding-nonsynon, intron, etc.)
// set('unknown','coding-synon','intron','cds-reference','near-gene-3','near-gene-5',
// 'nonsense','missense','frameshift','untranslated-3','untranslated-5','splice-3','splice-5')
public String locType; // How the variant affects the reference sequence
// enum('range','exact','between','rangeInsertion','rangeSubstitution','rangeDeletion')
public int weight; // The quality of the alignment
// ----------------------------------------------------------------------
//
// Constructors
//
// ----------------------------------------------------------------------
public rodDbSNP() {}
public String getContig() { return this.contig; }
public long getStart() { return start; }
public long getStop() { return stop; }
// ----------------------------------------------------------------------
//
// formatting
//
// ----------------------------------------------------------------------
public String toString() {
return String.format("%s\t%d\t%d\t%s\t%s\t%s\t%s\t%s\t%s\t%f\t%f\t%s\t%s\t%d",
contig, start, stop, name, strand, observed, molType,
varType, validationStatus, avHet, avHetSE, func, locType, weight );
}
public String toSimpleString() {
return String.format("%s:%s", name, observed);
}
public void parseLine(final String[] parts) {
//System.out.printf("Parsing GFFLine %s%n", Utils.join(" <=> ", parts));
contig = parts[1];
start = Long.parseLong(parts[2]) + 1; // The final is 0 based
stop = Long.parseLong(parts[3]) + 1; // The final is 0 based
name = parts[4];
strand = parts[5];
observed = parts[9];
molType = parts[10];
varType = parts[11];
validationStatus = parts[12];
avHet = Double.parseDouble(parts[13]);
avHetSE = Double.parseDouble(parts[14]);
func = parts[15];
locType = parts[16];
weight = Integer.parseInt(parts[17]);
// Cut up the observed bases string into an array of individual bases
String[] bases = observed.split("/");
observedBases = new char[bases.length];
for ( String elt : bases ) {
observedBases[0] = (char)elt.getBytes()[0];
//System.out.printf(" Bases %s %d %c%n", elt, elt.getBytes()[0], (char)elt.getBytes()[0]);
}
//System.out.printf(" => Observed bases are %s%n", Utils.join(" B ", bases));
}
}

View File

@ -0,0 +1,123 @@
package edu.mit.broad.sting.utils;
import edu.mit.broad.sam.SAMRecord;
import edu.mit.broad.sam.util.CloseableIterator;
import edu.mit.broad.picard.util.TabbedTextFileParser;
import java.io.File;
import java.io.InputStream;
import java.io.FileInputStream;
import java.io.BufferedInputStream;
import java.util.Iterator;
import java.util.HashMap;
/**
* Class for representing arbitrary reference ordered data sets
*
* User: mdepristo
* Date: Feb 27, 2009
* Time: 10:47:14 AM
* To change this template use File | Settings | File Templates.
*/
public class rodGFF extends ReferenceOrderedDatum {
private String contig, source, feature, strand, frame;
private long start, stop;
private double score;
private HashMap<String, String> attributes;
// ----------------------------------------------------------------------
//
// Constructors
//
// ----------------------------------------------------------------------
public rodGFF() {
}
public void setValues(final String contig, final String source, final String feature,
final long start, final long stop, final double score,
final String strand, final String frame, HashMap<String, String> attributes) {
this.contig = contig;
this.source = source;
this.feature = feature;
this.start = start;
this.stop= stop;
this.score = score;
this.strand = strand;
this.frame = frame;
this.attributes = attributes;
}
// ----------------------------------------------------------------------
//
// Accessors
//
// ----------------------------------------------------------------------
public String getContig() {
return this.contig;
}
public String getSource() {
return source;
}
public String getFeature() {
return feature;
}
public String getStrand() {
return strand;
}
public String getFrame() {
return frame;
}
public long getStart() {
return start;
}
public long getStop() {
return stop;
}
public double getScore() {
return score;
}
public String getAttribute(final String key) {
return attributes.get(key);
}
// ----------------------------------------------------------------------
//
// formatting
//
// ----------------------------------------------------------------------
public String toString() {
return String.format("%s\t%s\t%s\t%d\t%d\t%f\t%s\t%s", contig, source, feature, start, stop, score, strand, frame);
}
public String toSimpleString() {
return String.format("%s", feature);
}
public void parseLine(final String[] parts) {
//System.out.printf("Parsing GFFLine %s%n", Utils.join(" ", parts));
final String contig = parts[0];
final String source = parts[1];
final String feature = parts[2];
final long start = Long.parseLong(parts[3]);
final long stop = Long.parseLong(parts[4]);
double score = Double.NaN;
if ( ! parts[5].equals(".") )
score = Double.parseDouble(parts[5]);
final String strand = parts[6];
final String frame = parts[7];
HashMap<String, String> attributes = null;
setValues(contig, source, feature, start, stop, score, strand, frame, attributes);
}
}