Phase II support for generic reference order data set parsing.
Can read GFF and dbSNP files correctly. Traversal engine now supports keeping moving window of reference ordered data along with the locus iterator. Tested by walking through a sam file keeping track of the dbSNP positions encountered -- they definitely look right... git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@13 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
dee840efa5
commit
f64f3e2d90
|
|
@ -7,9 +7,13 @@ import edu.mit.broad.picard.cmdline.Option;
|
|||
|
||||
import edu.mit.broad.sting.atk.modules.*;
|
||||
import edu.mit.broad.sting.utils.ReferenceOrderedData;
|
||||
import edu.mit.broad.sting.utils.rodGFF;
|
||||
import edu.mit.broad.sting.utils.rodDbSNP;
|
||||
|
||||
import java.io.*;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.ArrayList;
|
||||
|
||||
public class AnalysisTK extends CommandLineProgram {
|
||||
// Usage and parameters
|
||||
|
|
@ -44,10 +48,23 @@ public class AnalysisTK extends CommandLineProgram {
|
|||
}
|
||||
|
||||
protected int doWork() {
|
||||
ReferenceOrderedData rod = new ReferenceOrderedData(new File("trunk/data/gFFTest.gff"));
|
||||
rod.testMe();
|
||||
final boolean TEST_ROD = false;
|
||||
ReferenceOrderedData[] rods = null;
|
||||
|
||||
this.engine = new TraversalEngine(INPUT_FILE, REF_FILE_ARG);
|
||||
if ( TEST_ROD ) {
|
||||
ReferenceOrderedData gff = new ReferenceOrderedData(new File("trunk/data/gFFTest.gff"), rodGFF.class );
|
||||
gff.testMe();
|
||||
|
||||
//ReferenceOrderedData dbsnp = new ReferenceOrderedData(new File("trunk/data/dbSNP_head.txt"), rodDbSNP.class );
|
||||
ReferenceOrderedData dbsnp = new ReferenceOrderedData(new File("/Volumes/Users/mdepristo/broad/ATK/exampleSAMs/dbSNP_chr20.txt"), rodDbSNP.class );
|
||||
//dbsnp.testMe();
|
||||
rods = new ReferenceOrderedData[] { dbsnp }; // { gff, dbsnp };
|
||||
}
|
||||
else {
|
||||
rods = new ReferenceOrderedData[] {}; // { gff, dbsnp };
|
||||
}
|
||||
|
||||
this.engine = new TraversalEngine(INPUT_FILE, REF_FILE_ARG, rods);
|
||||
|
||||
ValidationStringency strictness;
|
||||
if ( STRICTNESS_ARG == null ) {
|
||||
|
|
|
|||
|
|
@ -1,6 +1,9 @@
|
|||
package edu.mit.broad.sting.atk;
|
||||
|
||||
import edu.mit.broad.sting.atk.LocusIterator;
|
||||
import edu.mit.broad.sting.utils.ReferenceOrderedDatum;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
|
|
@ -14,10 +17,10 @@ public interface LocusWalker<MapType, ReduceType> {
|
|||
public String walkerType();
|
||||
|
||||
// Do we actually want to operate on the context?
|
||||
boolean filter(char ref, LocusIterator context);
|
||||
boolean filter(List<ReferenceOrderedDatum> rodData, char ref, LocusIterator context);
|
||||
|
||||
// Map over the edu.mit.broad.sting.atk.LocusContext
|
||||
MapType map(char ref, LocusIterator context);
|
||||
MapType map(List<ReferenceOrderedDatum> rodData, char ref, LocusIterator context);
|
||||
|
||||
// Given result of map function
|
||||
ReduceType reduceInit();
|
||||
|
|
|
|||
|
|
@ -9,13 +9,21 @@ import edu.mit.broad.picard.filter.FilteringIterator;
|
|||
import edu.mit.broad.picard.reference.ReferenceSequenceFile;
|
||||
import edu.mit.broad.picard.reference.ReferenceSequenceFileFactory;
|
||||
import edu.mit.broad.sting.utils.ReferenceIterator;
|
||||
import edu.mit.broad.sting.utils.ReferenceOrderedData;
|
||||
import edu.mit.broad.sting.utils.ReferenceOrderedDatum;
|
||||
|
||||
import java.io.*;
|
||||
import java.util.List;
|
||||
import java.util.Iterator;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
|
||||
public class TraversalEngine {
|
||||
// Usage and parameters
|
||||
private File readsFile = null;
|
||||
private File refFileName = null;
|
||||
private List<ReferenceOrderedData> rods = null;
|
||||
|
||||
private String regionStr = null;
|
||||
private String traversalType = null;
|
||||
private ValidationStringency strictness = ValidationStringency.STRICT;
|
||||
|
|
@ -42,9 +50,10 @@ public class TraversalEngine {
|
|||
// Setting up the engine
|
||||
//
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
public TraversalEngine(File reads, File ref) {
|
||||
public TraversalEngine(File reads, File ref, ReferenceOrderedData[] rods ) {
|
||||
readsFile = reads;
|
||||
refFileName = ref;
|
||||
this.rods = Arrays.asList(rods);
|
||||
}
|
||||
|
||||
public void setRegion(final String reg) { regionStr = regionStr; }
|
||||
|
|
@ -100,6 +109,30 @@ public class TraversalEngine {
|
|||
System.exit(1);
|
||||
}
|
||||
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// dealing with reference ordered data
|
||||
//
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
|
||||
protected List<ReferenceOrderedData.RODIterator> initializeRODs() {
|
||||
// set up reference ordered data
|
||||
List<ReferenceOrderedData.RODIterator> rodIters = new ArrayList<ReferenceOrderedData.RODIterator>();
|
||||
for ( ReferenceOrderedData data : rods ) {
|
||||
rodIters.add(data.iterator());
|
||||
}
|
||||
return rodIters;
|
||||
}
|
||||
|
||||
protected List<ReferenceOrderedDatum> getReferenceOrderedDataAtLocus(List<ReferenceOrderedData.RODIterator> rodIters,
|
||||
final String contig, final int pos) {
|
||||
List<ReferenceOrderedDatum> data = new ArrayList<ReferenceOrderedDatum>();
|
||||
for ( ReferenceOrderedData.RODIterator iter : rodIters ) {
|
||||
data.add(iter.seekForward(contig, pos));
|
||||
}
|
||||
return data;
|
||||
}
|
||||
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// traversal functions
|
||||
|
|
@ -159,6 +192,8 @@ public class TraversalEngine {
|
|||
FilteringIterator filterIter = new FilteringIterator(readStream.iterator(), new locusStreamFilterFunc());
|
||||
CloseableIterator<LocusIterator> iter = new LocusIterator(filterIter);
|
||||
|
||||
List<ReferenceOrderedData.RODIterator> rodIters = initializeRODs();
|
||||
|
||||
T sum = walker.reduceInit();
|
||||
while ( iter.hasNext() ) {
|
||||
this.nRecords++;
|
||||
|
|
@ -167,13 +202,14 @@ public class TraversalEngine {
|
|||
final LocusIterator locus = iter.next();
|
||||
final ReferenceIterator refSite = refIter.seekForward(locus.getContig(), locus.getPosition());
|
||||
final char refBase = refSite.getBaseAsChar();
|
||||
final List<ReferenceOrderedDatum> rodData = getReferenceOrderedDataAtLocus(rodIters, locus.getContig(), locus.getPosition());
|
||||
|
||||
if ( DEBUGGING )
|
||||
System.out.printf(" Reference: %s:%d %c%n", refSite.getCurrentContig().getName(), refSite.getPosition(), refBase);
|
||||
|
||||
final boolean keepMeP = walker.filter(refBase, locus);
|
||||
final boolean keepMeP = walker.filter(rodData, refBase, locus);
|
||||
if ( keepMeP ) {
|
||||
M x = walker.map(refBase, locus);
|
||||
M x = walker.map(rodData, refBase, locus);
|
||||
sum = walker.reduce(x, sum);
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -2,8 +2,11 @@ package edu.mit.broad.sting.atk.modules;
|
|||
|
||||
import edu.mit.broad.sting.atk.LocusWalker;
|
||||
import edu.mit.broad.sting.atk.LocusIterator;
|
||||
import edu.mit.broad.sting.utils.ReferenceOrderedDatum;
|
||||
import edu.mit.broad.sam.SAMRecord;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: mdepristo
|
||||
|
|
@ -18,12 +21,12 @@ public class EmptyLocusWalker implements LocusWalker<Integer, Integer> {
|
|||
public String walkerType() { return "ByLocus"; }
|
||||
|
||||
// Do we actually want to operate on the context?
|
||||
public boolean filter(char ref, LocusIterator context) {
|
||||
public boolean filter(List<ReferenceOrderedDatum> rodData, char ref, LocusIterator context) {
|
||||
return true; // We are keeping all the reads
|
||||
}
|
||||
|
||||
// Map over the edu.mit.broad.sting.atk.LocusContext
|
||||
public Integer map(char ref, LocusIterator context) {
|
||||
public Integer map(List<ReferenceOrderedDatum> rodData, char ref, LocusIterator context) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -2,6 +2,7 @@ package edu.mit.broad.sting.atk.modules;
|
|||
|
||||
import edu.mit.broad.sting.atk.LocusWalker;
|
||||
import edu.mit.broad.sting.atk.LocusIterator;
|
||||
import edu.mit.broad.sting.utils.ReferenceOrderedDatum;
|
||||
import edu.mit.broad.sam.SAMRecord;
|
||||
|
||||
import java.util.List;
|
||||
|
|
@ -20,12 +21,12 @@ public class PileupWalker implements LocusWalker<Integer, Integer> {
|
|||
public String walkerType() { return "ByLocus"; }
|
||||
|
||||
// Do we actually want to operate on the context?
|
||||
public boolean filter(char ref, LocusIterator context) {
|
||||
public boolean filter(List<ReferenceOrderedDatum> rodData, char ref, LocusIterator context) {
|
||||
return true; // We are keeping all the reads
|
||||
}
|
||||
|
||||
// Map over the edu.mit.broad.sting.atk.LocusContext
|
||||
public Integer map(char ref, LocusIterator context) {
|
||||
public Integer map(List<ReferenceOrderedDatum> rodData, char ref, LocusIterator context) {
|
||||
//System.out.printf("Reads %s:%d %d%n", context.getContig(), context.getPosition(), context.getReads().size());
|
||||
//for ( SAMRecord read : context.getReads() ) {
|
||||
// System.out.println(" -> " + read.getReadName());
|
||||
|
|
@ -49,8 +50,18 @@ public class PileupWalker implements LocusWalker<Integer, Integer> {
|
|||
//System.out.printf(" [%2d] [%s] %s%n", offset, read.getReadString().charAt(offset), read.getReadString());
|
||||
}
|
||||
|
||||
if ( context.getPosition() % 10 == 0 )
|
||||
System.out.printf("%s:%d: %s %s %s%n", context.getContig(), context.getPosition(), ref, bases, quals);
|
||||
String rodString = "";
|
||||
for ( ReferenceOrderedDatum datum : rodData ) {
|
||||
if ( datum != null ) {
|
||||
rodString += datum.toSimpleString();
|
||||
}
|
||||
}
|
||||
if ( rodString != "" )
|
||||
rodString = "[ROD: " + rodString + "]";
|
||||
|
||||
if ( context.getPosition() % 1 == 0 ) {
|
||||
System.out.printf("%s:%d: %s %s %s %s%n", context.getContig(), context.getPosition(), ref, bases, quals, rodString);
|
||||
}
|
||||
|
||||
//for ( int offset : context.getOffsets() ) {
|
||||
// System.out.println(" -> " + read.getReadName());
|
||||
|
|
|
|||
|
|
@ -1,15 +1,7 @@
|
|||
package edu.mit.broad.sting.utils;
|
||||
|
||||
import edu.mit.broad.sam.SAMRecord;
|
||||
import edu.mit.broad.sam.util.CloseableIterator;
|
||||
import edu.mit.broad.picard.util.TabbedTextFileParser;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.InputStream;
|
||||
import java.io.FileInputStream;
|
||||
import java.io.BufferedInputStream;
|
||||
import java.util.Iterator;
|
||||
import java.util.HashMap;
|
||||
|
||||
/**
|
||||
* Class for representing arbitrary reference ordered data sets
|
||||
|
|
@ -19,40 +11,17 @@ import java.util.HashMap;
|
|||
* Time: 10:47:14 AM
|
||||
* To change this template use File | Settings | File Templates.
|
||||
*/
|
||||
public class ReferenceOrderedData implements Iterable<ReferenceOrderedDatum> {
|
||||
public class ReferenceOrderedData<ROD extends ReferenceOrderedDatum> implements Iterable<ROD> {
|
||||
private File file = null;
|
||||
private Class<ROD> type = null; // runtime type information for object construction
|
||||
|
||||
public ReferenceOrderedData(File file) {
|
||||
public ReferenceOrderedData(File file, Class<ROD> type ) {
|
||||
this.file = file;
|
||||
}
|
||||
|
||||
// ----------------------------------------------------------------------
|
||||
//
|
||||
// Iteration
|
||||
//
|
||||
// ----------------------------------------------------------------------
|
||||
private class RODIterator implements Iterator<ReferenceOrderedDatum> {
|
||||
TabbedTextFileParser parser = null;
|
||||
public RODIterator() {
|
||||
parser = new TabbedTextFileParser(true, file);
|
||||
}
|
||||
|
||||
public boolean hasNext() {
|
||||
return parser.hasNext();
|
||||
}
|
||||
|
||||
public ReferenceOrderedDatum next() {
|
||||
String parts[] = parser.next();
|
||||
return parseGFFLine(parts);
|
||||
}
|
||||
|
||||
public void remove () {
|
||||
throw new UnsupportedOperationException();
|
||||
}
|
||||
this.type = type;
|
||||
}
|
||||
|
||||
public RODIterator iterator() {
|
||||
return new RODIterator();
|
||||
return new RODIterator(new SimpleRODIterator());
|
||||
}
|
||||
|
||||
// ----------------------------------------------------------------------
|
||||
|
|
@ -66,28 +35,112 @@ public class ReferenceOrderedData implements Iterable<ReferenceOrderedDatum> {
|
|||
}
|
||||
}
|
||||
|
||||
// ----------------------------------------------------------------------
|
||||
//
|
||||
// Iteration
|
||||
//
|
||||
// ----------------------------------------------------------------------
|
||||
private class SimpleRODIterator implements Iterator<ROD> {
|
||||
private WhitespaceTextFileParser parser = null;
|
||||
|
||||
public SimpleRODIterator() {
|
||||
parser = new WhitespaceTextFileParser(true, file);
|
||||
}
|
||||
|
||||
public boolean hasNext() {
|
||||
return parser.hasNext();
|
||||
}
|
||||
|
||||
public ROD next() {
|
||||
String parts[] = parser.next();
|
||||
return parseLine(parts);
|
||||
}
|
||||
|
||||
public void remove() {
|
||||
throw new UnsupportedOperationException();
|
||||
}
|
||||
}
|
||||
|
||||
public class RODIterator implements Iterator<ROD> {
|
||||
private PushbackIterator<ROD> it;
|
||||
private ROD prev = null;
|
||||
|
||||
public RODIterator(SimpleRODIterator it) {
|
||||
this.it = new PushbackIterator<ROD>(it);
|
||||
}
|
||||
|
||||
public boolean hasNext() { return it.hasNext(); }
|
||||
public ROD next() {
|
||||
prev = it.next();
|
||||
return prev;
|
||||
}
|
||||
|
||||
//
|
||||
// Seeks forward in the file until we reach (or cross) a record at contig / pos
|
||||
// If we don't find anything and cross beyond contig / pos, we return null
|
||||
// Otherwise we return the first object who's start is at pos
|
||||
//
|
||||
public ROD seekForward(final String contigName, final int pos) {
|
||||
ROD result = null;
|
||||
|
||||
//System.out.printf(" *** starting seek to %s %d %s%n", contigName, pos, prev);
|
||||
while ( hasNext() ) {
|
||||
ROD current = next();
|
||||
//System.out.printf(" -> Seeking to %s %d AT %s %d%n", contigName, pos, current.getContig(), current.getStart());
|
||||
int strCmp = contigName.compareTo( prev.getContig() );
|
||||
if ( strCmp == 0 ) {
|
||||
// The contigs are equal
|
||||
if ( current.getStart() > pos ) {
|
||||
// There was nothing to find, push back next and return null
|
||||
it.pushback(current);
|
||||
break;
|
||||
}
|
||||
else if ( pos == current.getStart() ) {
|
||||
// We found a record at contig / pos, return it
|
||||
result = current;
|
||||
break;
|
||||
}
|
||||
}
|
||||
else if ( strCmp < 0 ) {
|
||||
// We've gone past the desired contig, break
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
if ( result == null )
|
||||
;
|
||||
//System.out.printf(" --- seek result to %s %d is NULL%n", contigName, pos);
|
||||
else
|
||||
System.out.printf(" ### Found %s %d%n", result.getContig(), result.getStart());
|
||||
*/
|
||||
|
||||
// we ran out of elements or found something
|
||||
return result;
|
||||
}
|
||||
|
||||
public void remove() {
|
||||
throw new UnsupportedOperationException();
|
||||
}
|
||||
}
|
||||
|
||||
// ----------------------------------------------------------------------
|
||||
//
|
||||
// Parsing
|
||||
//
|
||||
// ----------------------------------------------------------------------
|
||||
ReferenceOrderedDatum parseGFFLine(final String[] parts) {
|
||||
ROD parseLine(final String[] parts) {
|
||||
//System.out.printf("Parsing GFFLine %s%n", Utils.join(" ", parts));
|
||||
|
||||
final String contig = parts[0];
|
||||
final String source = parts[1];
|
||||
final String feature = parts[2];
|
||||
final long start = Long.parseLong(parts[3]);
|
||||
final long stop = Long.parseLong(parts[4]);
|
||||
|
||||
double score = Double.NaN;
|
||||
if ( ! parts[5].equals(".") )
|
||||
score = Double.parseDouble(parts[5]);
|
||||
|
||||
final String strand = parts[6];
|
||||
final String frame = parts[7];
|
||||
HashMap<String, String> attributes = null;
|
||||
return new ReferenceOrderedDatum(contig, source, feature, start, stop, score, strand, frame, attributes);
|
||||
try {
|
||||
ROD obj = type.newInstance();
|
||||
obj.parseLine(parts);
|
||||
return obj;
|
||||
} catch ( java.lang.InstantiationException e ) {
|
||||
System.out.println(e);
|
||||
return null; // wow, unsafe!
|
||||
} catch ( java.lang.IllegalAccessException e ) {
|
||||
System.out.println(e);
|
||||
return null; // wow, unsafe!
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,7 +1,5 @@
|
|||
package edu.mit.broad.sting.utils;
|
||||
|
||||
import java.util.HashMap;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: mdepristo
|
||||
|
|
@ -9,85 +7,15 @@ import java.util.HashMap;
|
|||
* Time: 10:49:47 AM
|
||||
* To change this template use File | Settings | File Templates.
|
||||
*/
|
||||
public class ReferenceOrderedDatum {
|
||||
private String contig, source, feature, strand, frame;
|
||||
private long start, stop;
|
||||
private double score;
|
||||
private HashMap<String, String> attributes;
|
||||
public abstract class ReferenceOrderedDatum {
|
||||
public ReferenceOrderedDatum() { }
|
||||
|
||||
// ----------------------------------------------------------------------
|
||||
//
|
||||
// Constructors
|
||||
//
|
||||
// ----------------------------------------------------------------------
|
||||
public ReferenceOrderedDatum(final String contig, final String source, final String feature,
|
||||
final long start, final long stop, final double score,
|
||||
final String strand, final String frame, HashMap<String, String> attributes) {
|
||||
this.contig = contig;
|
||||
this.source = source;
|
||||
this.feature = feature;
|
||||
this.start = start;
|
||||
this.stop= stop;
|
||||
this.score = score;
|
||||
this.strand = strand;
|
||||
this.frame = frame;
|
||||
this.attributes = attributes;
|
||||
}
|
||||
public abstract void parseLine(final String[] parts);
|
||||
|
||||
// public ReferenceOrderedDatum(final String contig, final String source, final String feature,
|
||||
// final long start, final long stop, final double score,
|
||||
// final String strand, final String frame) {
|
||||
// ReferenceOrderedDatum(contig, source, feature, start, stop, score, strand, frame, null);
|
||||
// }
|
||||
|
||||
// ----------------------------------------------------------------------
|
||||
//
|
||||
// Accessors
|
||||
//
|
||||
// ----------------------------------------------------------------------
|
||||
public String getContig() {
|
||||
return this.contig;
|
||||
}
|
||||
|
||||
public String getSource() {
|
||||
return source;
|
||||
}
|
||||
|
||||
public String getFeature() {
|
||||
return feature;
|
||||
}
|
||||
|
||||
public String getStrand() {
|
||||
return strand;
|
||||
}
|
||||
|
||||
public String getFrame() {
|
||||
return frame;
|
||||
}
|
||||
|
||||
public long getStart() {
|
||||
return start;
|
||||
}
|
||||
|
||||
public long getStop() {
|
||||
return stop;
|
||||
}
|
||||
|
||||
public double getScore() {
|
||||
return score;
|
||||
}
|
||||
|
||||
public String getAttribute(final String key) {
|
||||
return attributes.get(key);
|
||||
}
|
||||
|
||||
// ----------------------------------------------------------------------
|
||||
//
|
||||
// formatting
|
||||
//
|
||||
// ----------------------------------------------------------------------
|
||||
public String toString() {
|
||||
return String.format("%s\t%s\t%s\t%d\t%d\t%f\t%s\t%s", contig, source, feature, start, stop, score, strand, frame);
|
||||
}
|
||||
public abstract String toString();
|
||||
public abstract String toSimpleString();
|
||||
|
||||
public abstract String getContig();
|
||||
public abstract long getStart();
|
||||
public abstract long getStop();
|
||||
}
|
||||
|
|
|
|||
|
|
@ -0,0 +1,43 @@
|
|||
/*
|
||||
* The Broad Institute
|
||||
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
|
||||
* This software and its documentation are copyright 2009 by the
|
||||
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
|
||||
*
|
||||
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
|
||||
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
|
||||
*/
|
||||
package edu.mit.broad.sting.utils;
|
||||
|
||||
import edu.mit.broad.picard.util.BasicTextFileParser;
|
||||
|
||||
import java.io.File;
|
||||
import java.util.List;
|
||||
import java.util.Arrays;
|
||||
|
||||
/**
|
||||
* Parser for tab-delimited files
|
||||
*
|
||||
* @author Kathleen Tibbetts
|
||||
*/
|
||||
public class WhitespaceTextFileParser extends BasicTextFileParser {
|
||||
|
||||
/**
|
||||
* Constructor
|
||||
*
|
||||
* @param file The file to parse
|
||||
*/
|
||||
public WhitespaceTextFileParser(boolean treatGroupedDelimitersAsOne, File... file) {
|
||||
super(treatGroupedDelimitersAsOne, file);
|
||||
}
|
||||
|
||||
/**
|
||||
* Determines whether a given character is a delimiter
|
||||
*
|
||||
* @param b the character to evaluate
|
||||
* @return true if <code>b</code> is a delimiter; otherwise false
|
||||
*/
|
||||
protected boolean isDelimiter(byte b) {
|
||||
return Character.isWhitespace((char)b);
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,102 @@
|
|||
package edu.mit.broad.sting.utils;
|
||||
|
||||
import edu.mit.broad.sam.SAMRecord;
|
||||
import edu.mit.broad.sam.util.CloseableIterator;
|
||||
import edu.mit.broad.picard.util.TabbedTextFileParser;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.InputStream;
|
||||
import java.io.FileInputStream;
|
||||
import java.io.BufferedInputStream;
|
||||
import java.util.Iterator;
|
||||
import java.util.HashMap;
|
||||
|
||||
/**
|
||||
* Example format:
|
||||
* 585 chr1 433 433 rs56289060 0 + - - -/C genomic insertion unknown 0 0 unknown between 1
|
||||
* 585 chr1 491 492 rs55998931 0 + C C C/T genomic single unknown 0 0 unknown exact 1
|
||||
*
|
||||
* User: mdepristo
|
||||
* Date: Feb 27, 2009
|
||||
* Time: 10:47:14 AM
|
||||
* To change this template use File | Settings | File Templates.
|
||||
*/
|
||||
public class rodDbSNP extends ReferenceOrderedDatum {
|
||||
public String contig; // Reference sequence chromosome or scaffold
|
||||
public long start, stop; // Start and stop positions in chrom
|
||||
|
||||
public String name; // Reference SNP identifier or Affy SNP name
|
||||
public String strand; // Which DNA strand contains the observed alleles
|
||||
public String observed; // The sequences of the observed alleles from rs-fasta files
|
||||
public char[] observedBases; // The sequences of the observed alleles from rs-fasta files
|
||||
public String molType; // Sample type from exemplar ss
|
||||
public String varType; // The class of variant (simple, insertion, deletion, range, etc.)
|
||||
// Can be 'unknown','single','in-del','het','microsatellite','named','mixed','mnp','insertion','deletion'
|
||||
public String validationStatus; // The validation status of the SNP
|
||||
// one of set('unknown','by-cluster','by-frequency','by-submitter','by-2hit-2allele','by-hapmap')
|
||||
|
||||
public double avHet; // The average heterozygosity from all observations
|
||||
public double avHetSE; // The Standard Error for the average heterozygosity
|
||||
|
||||
public String func; // The functional category of the SNP (coding-synon, coding-nonsynon, intron, etc.)
|
||||
// set('unknown','coding-synon','intron','cds-reference','near-gene-3','near-gene-5',
|
||||
// 'nonsense','missense','frameshift','untranslated-3','untranslated-5','splice-3','splice-5')
|
||||
public String locType; // How the variant affects the reference sequence
|
||||
// enum('range','exact','between','rangeInsertion','rangeSubstitution','rangeDeletion')
|
||||
|
||||
public int weight; // The quality of the alignment
|
||||
|
||||
// ----------------------------------------------------------------------
|
||||
//
|
||||
// Constructors
|
||||
//
|
||||
// ----------------------------------------------------------------------
|
||||
public rodDbSNP() {}
|
||||
|
||||
public String getContig() { return this.contig; }
|
||||
public long getStart() { return start; }
|
||||
public long getStop() { return stop; }
|
||||
|
||||
// ----------------------------------------------------------------------
|
||||
//
|
||||
// formatting
|
||||
//
|
||||
// ----------------------------------------------------------------------
|
||||
public String toString() {
|
||||
return String.format("%s\t%d\t%d\t%s\t%s\t%s\t%s\t%s\t%s\t%f\t%f\t%s\t%s\t%d",
|
||||
contig, start, stop, name, strand, observed, molType,
|
||||
varType, validationStatus, avHet, avHetSE, func, locType, weight );
|
||||
}
|
||||
|
||||
public String toSimpleString() {
|
||||
return String.format("%s:%s", name, observed);
|
||||
}
|
||||
|
||||
public void parseLine(final String[] parts) {
|
||||
//System.out.printf("Parsing GFFLine %s%n", Utils.join(" <=> ", parts));
|
||||
contig = parts[1];
|
||||
start = Long.parseLong(parts[2]) + 1; // The final is 0 based
|
||||
stop = Long.parseLong(parts[3]) + 1; // The final is 0 based
|
||||
name = parts[4];
|
||||
strand = parts[5];
|
||||
observed = parts[9];
|
||||
molType = parts[10];
|
||||
varType = parts[11];
|
||||
validationStatus = parts[12];
|
||||
avHet = Double.parseDouble(parts[13]);
|
||||
avHetSE = Double.parseDouble(parts[14]);
|
||||
func = parts[15];
|
||||
locType = parts[16];
|
||||
weight = Integer.parseInt(parts[17]);
|
||||
|
||||
// Cut up the observed bases string into an array of individual bases
|
||||
String[] bases = observed.split("/");
|
||||
observedBases = new char[bases.length];
|
||||
for ( String elt : bases ) {
|
||||
observedBases[0] = (char)elt.getBytes()[0];
|
||||
//System.out.printf(" Bases %s %d %c%n", elt, elt.getBytes()[0], (char)elt.getBytes()[0]);
|
||||
}
|
||||
//System.out.printf(" => Observed bases are %s%n", Utils.join(" B ", bases));
|
||||
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,123 @@
|
|||
package edu.mit.broad.sting.utils;
|
||||
|
||||
import edu.mit.broad.sam.SAMRecord;
|
||||
import edu.mit.broad.sam.util.CloseableIterator;
|
||||
import edu.mit.broad.picard.util.TabbedTextFileParser;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.InputStream;
|
||||
import java.io.FileInputStream;
|
||||
import java.io.BufferedInputStream;
|
||||
import java.util.Iterator;
|
||||
import java.util.HashMap;
|
||||
|
||||
/**
|
||||
* Class for representing arbitrary reference ordered data sets
|
||||
*
|
||||
* User: mdepristo
|
||||
* Date: Feb 27, 2009
|
||||
* Time: 10:47:14 AM
|
||||
* To change this template use File | Settings | File Templates.
|
||||
*/
|
||||
public class rodGFF extends ReferenceOrderedDatum {
|
||||
private String contig, source, feature, strand, frame;
|
||||
private long start, stop;
|
||||
private double score;
|
||||
private HashMap<String, String> attributes;
|
||||
|
||||
// ----------------------------------------------------------------------
|
||||
//
|
||||
// Constructors
|
||||
//
|
||||
// ----------------------------------------------------------------------
|
||||
public rodGFF() {
|
||||
|
||||
}
|
||||
|
||||
public void setValues(final String contig, final String source, final String feature,
|
||||
final long start, final long stop, final double score,
|
||||
final String strand, final String frame, HashMap<String, String> attributes) {
|
||||
this.contig = contig;
|
||||
this.source = source;
|
||||
this.feature = feature;
|
||||
this.start = start;
|
||||
this.stop= stop;
|
||||
this.score = score;
|
||||
this.strand = strand;
|
||||
this.frame = frame;
|
||||
this.attributes = attributes;
|
||||
}
|
||||
|
||||
// ----------------------------------------------------------------------
|
||||
//
|
||||
// Accessors
|
||||
//
|
||||
// ----------------------------------------------------------------------
|
||||
public String getContig() {
|
||||
return this.contig;
|
||||
}
|
||||
|
||||
public String getSource() {
|
||||
return source;
|
||||
}
|
||||
|
||||
public String getFeature() {
|
||||
return feature;
|
||||
}
|
||||
|
||||
public String getStrand() {
|
||||
return strand;
|
||||
}
|
||||
|
||||
public String getFrame() {
|
||||
return frame;
|
||||
}
|
||||
|
||||
public long getStart() {
|
||||
return start;
|
||||
}
|
||||
|
||||
public long getStop() {
|
||||
return stop;
|
||||
}
|
||||
|
||||
public double getScore() {
|
||||
return score;
|
||||
}
|
||||
|
||||
public String getAttribute(final String key) {
|
||||
return attributes.get(key);
|
||||
}
|
||||
|
||||
// ----------------------------------------------------------------------
|
||||
//
|
||||
// formatting
|
||||
//
|
||||
// ----------------------------------------------------------------------
|
||||
public String toString() {
|
||||
return String.format("%s\t%s\t%s\t%d\t%d\t%f\t%s\t%s", contig, source, feature, start, stop, score, strand, frame);
|
||||
}
|
||||
|
||||
public String toSimpleString() {
|
||||
return String.format("%s", feature);
|
||||
}
|
||||
|
||||
public void parseLine(final String[] parts) {
|
||||
//System.out.printf("Parsing GFFLine %s%n", Utils.join(" ", parts));
|
||||
|
||||
final String contig = parts[0];
|
||||
final String source = parts[1];
|
||||
final String feature = parts[2];
|
||||
final long start = Long.parseLong(parts[3]);
|
||||
final long stop = Long.parseLong(parts[4]);
|
||||
|
||||
double score = Double.NaN;
|
||||
if ( ! parts[5].equals(".") )
|
||||
score = Double.parseDouble(parts[5]);
|
||||
|
||||
final String strand = parts[6];
|
||||
final String frame = parts[7];
|
||||
HashMap<String, String> attributes = null;
|
||||
setValues(contig, source, feature, start, stop, score, strand, frame, attributes);
|
||||
}
|
||||
}
|
||||
Loading…
Reference in New Issue