From 01e950ef2f1153fb19bde2ed31cd4721249492a9 Mon Sep 17 00:00:00 2001 From: depristo Date: Sun, 15 Mar 2009 22:37:20 +0000 Subject: [PATCH] Reference ordered data files git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@55 348d0f76-0448-11de-a6fe-93d51630548a --- .../gatk/refdata/ReferenceOrderedData.java | 213 ++++++++++++++++++ .../gatk/refdata/ReferenceOrderedDatum.java | 30 +++ .../sting/gatk/refdata/rodDbSNP.java | 157 +++++++++++++ .../sting/gatk/refdata/rodGFF.java | 113 ++++++++++ 4 files changed, 513 insertions(+) create mode 100644 playground/java/src/org/broadinstitute/sting/gatk/refdata/ReferenceOrderedData.java create mode 100644 playground/java/src/org/broadinstitute/sting/gatk/refdata/ReferenceOrderedDatum.java create mode 100644 playground/java/src/org/broadinstitute/sting/gatk/refdata/rodDbSNP.java create mode 100644 playground/java/src/org/broadinstitute/sting/gatk/refdata/rodGFF.java diff --git a/playground/java/src/org/broadinstitute/sting/gatk/refdata/ReferenceOrderedData.java b/playground/java/src/org/broadinstitute/sting/gatk/refdata/ReferenceOrderedData.java new file mode 100644 index 000000000..2263ecfed --- /dev/null +++ b/playground/java/src/org/broadinstitute/sting/gatk/refdata/ReferenceOrderedData.java @@ -0,0 +1,213 @@ +package org.broadinstitute.sting.gatk.refdata; + +import java.io.File; +import java.io.FileWriter; +import java.io.IOException; +import java.util.Iterator; +import java.util.ArrayList; +import java.util.Collections; + +import edu.mit.broad.picard.util.TabbedTextFileParser; +import org.broadinstitute.sting.gatk.iterators.PushbackIterator; +import org.broadinstitute.sting.utils.GenomeLoc; + +/** + * Class for representing arbitrary reference ordered data sets + * + * User: mdepristo + * Date: Feb 27, 2009 + * Time: 10:47:14 AM + * To change this template use File | Settings | File Templates. + */ +public class ReferenceOrderedData implements Iterable { + private File file = null; + private Class type = null; // runtime type information for object construction + + public ReferenceOrderedData(File file, Class type ) { + this.file = file; + this.type = type; + } + + public RODIterator iterator() { + return new RODIterator(new SimpleRODIterator()); + } + + // ---------------------------------------------------------------------- + // + // Testing + // + // ---------------------------------------------------------------------- + public void testMe() { + ReferenceOrderedDatum last = null; + for ( ReferenceOrderedDatum rec : this ) { + if ( last == null || ! last.getLocation().onSameContig(rec.getLocation()) ) { + System.out.println(rec.toString()); + } + last = rec; + } + System.exit(1); + } + + // ---------------------------------------------------------------------- + // + // Manipulations of all of the data + // + // ---------------------------------------------------------------------- + public ArrayList readAll() { + ArrayList elts = new ArrayList(); + for ( ReferenceOrderedDatum rec : this ) { + elts.add(rec); + } + elts.trimToSize(); + return elts; + } + + public static void sortRODDataInMemory(ArrayList data) { + Collections.sort(data); + } + + public static void write(ArrayList data, File output) throws IOException { + final FileWriter out = new FileWriter(output); + + for ( ReferenceOrderedDatum rec : data ) { + out.write(rec.repl() + "\n"); + } + + out.close(); + } + + public boolean validateFile() throws Exception { + ReferenceOrderedDatum last = null; + for ( ReferenceOrderedDatum rec : this ) { + if ( last != null && last.compareTo(rec) == 1 ) { + // It's out of order + throw new Exception("Out of order elements at \n" + last.toString() + "\n" + rec.toString()); + } + last = rec; + } + return true; + } + + public void indexFile() { + // Fixme -- get access to the linear index system from Jim + } + + // ---------------------------------------------------------------------- + // + // Iteration + // + // ---------------------------------------------------------------------- + private class SimpleRODIterator implements Iterator { + //private WhitespaceTextFileParser parser = null; + private TabbedTextFileParser parser = null; + + public SimpleRODIterator() { + parser = new TabbedTextFileParser(true, file); + } + + public boolean hasNext() { + return parser.hasNext(); + } + + public ROD next() { + String parts[] = parser.next(); + return parseLine(parts); + } + + public void remove() { + throw new UnsupportedOperationException(); + } + } + + public class RODIterator implements Iterator { + private PushbackIterator it; + private ROD prev = null; + + public RODIterator(SimpleRODIterator it) { + this.it = new PushbackIterator(it); + } + + public boolean hasNext() { return it.hasNext(); } + public ROD next() { + prev = it.next(); + return prev; + } + + // + // Seeks forward in the file until we reach (or cross) a record at contig / pos + // If we don't find anything and cross beyond contig / pos, we return null + // Otherwise we return the first object who's start is at pos + // + public ROD seekForward(final GenomeLoc loc) { + return seekForward(loc.getContig(), loc.getStart()); + } + + protected ROD seekForward(final String contigName, final long pos) { + final boolean DEBUG = false; + + ROD result = null; + + if ( DEBUG ) System.out.printf(" *** starting seek to %s %d %s%n", contigName, pos, prev); + while ( hasNext() ) { + ROD current = next(); + //System.out.printf(" -> Seeking to %s %d AT %s %d%n", contigName, pos, current.getContig(), current.getStart()); + int strCmp = GenomeLoc.compareContigs( contigName, prev.getContig() );// contigName.compareTo( prev.getContig() ); + if ( strCmp == 0 ) { + // The contigs are equal + if ( current.getStart() > pos ) { + // There was nothing to find, push back next and return null + it.pushback(current); + break; + } + else if ( pos == current.getStart() ) { + // We found a record at contig / pos, return it + result = current; + break; + } + } + else if ( strCmp < 0 ) { + if ( DEBUG ) System.out.printf(" -> Jumped to contig %s%n", contigName); + // We've gone past the desired contig, break + it.pushback(current); + break; + } + } + + if ( DEBUG ) { + if ( result == null ) + ; + //System.out.printf(" --- seek result to %s %d is NULL%n", contigName, pos); + else + System.out.printf(" ### Found %s %d%n", result.getContig(), result.getStart()); + } + + + // we ran out of elements or found something + return result; + } + + public void remove() { + throw new UnsupportedOperationException(); + } + } + + // ---------------------------------------------------------------------- + // + // Parsing + // + // ---------------------------------------------------------------------- + ROD parseLine(final String[] parts) { + //System.out.printf("Parsing GFFLine %s%n", Utils.join(" ", parts)); + try { + ROD obj = type.newInstance(); + obj.parseLine(parts); + return obj; + } catch ( java.lang.InstantiationException e ) { + System.out.println(e); + return null; // wow, unsafe! + } catch ( java.lang.IllegalAccessException e ) { + System.out.println(e); + return null; // wow, unsafe! + } + } +} diff --git a/playground/java/src/org/broadinstitute/sting/gatk/refdata/ReferenceOrderedDatum.java b/playground/java/src/org/broadinstitute/sting/gatk/refdata/ReferenceOrderedDatum.java new file mode 100644 index 000000000..51b1379b5 --- /dev/null +++ b/playground/java/src/org/broadinstitute/sting/gatk/refdata/ReferenceOrderedDatum.java @@ -0,0 +1,30 @@ +package org.broadinstitute.sting.gatk.refdata; + +import org.broadinstitute.sting.utils.GenomeLoc; + +/** + * Created by IntelliJ IDEA. + * User: mdepristo + * Date: Feb 27, 2009 + * Time: 10:49:47 AM + * To change this template use File | Settings | File Templates. + */ +public abstract class ReferenceOrderedDatum implements Comparable { + + public ReferenceOrderedDatum() { } + + public abstract void parseLine(final String[] parts); + + public abstract String toString(); + public abstract String toSimpleString(); + public abstract String repl(); + + public abstract GenomeLoc getLocation(); + public int compareTo( ReferenceOrderedDatum that ) { + return getLocation().compareTo(that.getLocation()); + } + + public final String getContig() { return getLocation().getContig(); } + public final long getStart() { return getLocation().getStart(); } + public final long getStop() { return getLocation().getStop(); } +} diff --git a/playground/java/src/org/broadinstitute/sting/gatk/refdata/rodDbSNP.java b/playground/java/src/org/broadinstitute/sting/gatk/refdata/rodDbSNP.java new file mode 100644 index 000000000..05c0244ba --- /dev/null +++ b/playground/java/src/org/broadinstitute/sting/gatk/refdata/rodDbSNP.java @@ -0,0 +1,157 @@ +package org.broadinstitute.sting.gatk.refdata; + +import edu.mit.broad.picard.util.SequenceUtil; + +import java.util.*; + +import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.Utils; + +/** + * Example format: + * 585 chr1 433 433 rs56289060 0 + - - -/C genomic insertion unknown 0 0 unknown between 1 + * 585 chr1 491 492 rs55998931 0 + C C C/T genomic single unknown 0 0 unknown exact 1 + * + * User: mdepristo + * Date: Feb 27, 2009 + * Time: 10:47:14 AM + * To change this template use File | Settings | File Templates. + */ +public class rodDbSNP extends ReferenceOrderedDatum { + public GenomeLoc loc; // genome location of SNP + // Reference sequence chromosome or scaffold + // Start and stop positions in chrom + + public String name; // Reference SNP identifier or Affy SNP name + public String strand; // Which DNA strand contains the observed alleles + + public String refBases; // the reference base according to NCBI, in the dbSNP file + public String observed; // The sequences of the observed alleles from rs-fasta files + + public String molType; // Sample type from exemplar ss + public String varType; // The class of variant (simple, insertion, deletion, range, etc.) + // Can be 'unknown','single','in-del','het','microsatellite','named','mixed','mnp','insertion','deletion' + public String validationStatus; // The validation status of the SNP + // one of set('unknown','by-cluster','by-frequency','by-submitter','by-2hit-2allele','by-hapmap') + + public double avHet; // The average heterozygosity from all observations + public double avHetSE; // The Standard Error for the average heterozygosity + + public String func; // The functional category of the SNP (coding-synon, coding-nonsynon, intron, etc.) + // set('unknown','coding-synon','intron','cds-reference','near-gene-3','near-gene-5', + // 'nonsense','missense','frameshift','untranslated-3','untranslated-5','splice-3','splice-5') + public String locType; // How the variant affects the reference sequence + // enum('range','exact','between','rangeInsertion','rangeSubstitution','rangeDeletion') + + public int weight; // The quality of the alignment + + // ---------------------------------------------------------------------- + // + // Constructors + // + // ---------------------------------------------------------------------- + public rodDbSNP() {} + + // ---------------------------------------------------------------------- + // + // manipulating the SNP information + // + // ---------------------------------------------------------------------- + public GenomeLoc getLocation() { return loc; } + + public boolean onFwdStrand() { + return strand.equals("+"); + } + + // Get the reference bases on the forward strand + public String getRefBasesFWD() { + if ( onFwdStrand() ) + return refBases; + else + return SequenceUtil.reverseComplement(refBases); + } + + public List getAllelesFWD() { + List alleles = null; + if ( onFwdStrand() ) + alleles = Arrays.asList(observed.split("/")); + else + alleles = Arrays.asList(SequenceUtil.reverseComplement(observed).split("/")); + + //System.out.printf("getAlleles %s on %s %b => %s %n", observed, strand, onFwdStrand(), Utils.join("/", alleles)); + return alleles; + } + + public String getAllelesFWDString() { + return Utils.join("/", getAllelesFWD()); + } + + // ---------------------------------------------------------------------- + // + // What kind of variant are we? + // + // ---------------------------------------------------------------------- + public boolean isSNP() { return varType.contains("single"); } + public boolean isInsertion() { return varType.contains("insertion"); } + public boolean isDeletion() { return varType.contains("deletion"); } + public boolean isIndel() { return varType.contains("in-del"); } + + public boolean isHapmap() { return validationStatus.contains("by-hapmap"); } + public boolean is2Hit2Allele() { return validationStatus.contains("by-2hit-2allele"); } + + // ---------------------------------------------------------------------- + // + // formatting + // + // ---------------------------------------------------------------------- + public String toString() { + return String.format("%s\t%d\t%d\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%f\t%f\t%s\t%s\t%d", + getContig(), getStart(), getStop(), name, strand, refBases, observed, molType, + varType, validationStatus, avHet, avHetSE, func, locType, weight ); + } + + public String toSimpleString() { + return String.format("%s:%s:%s", name, observed, strand); + } + + public String toMediumString() { + String s = String.format("%s:%s:%s", getLocation().toString(), name, getAllelesFWDString()); + if ( isSNP() ) s += ":SNP"; + if ( isIndel() ) s += ":Indel"; + if ( isHapmap() ) s += ":Hapmap"; + if ( is2Hit2Allele() ) s += ":2Hit"; + return s; + } + + public String repl() { + return String.format("%d\t%s\t%d\t%d\t%s\t0\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%f\t%f\t%s\t%s\t%d", + 585, getContig(), getStart()-1, getStop()-1, name, strand, refBases, refBases, observed, molType, + varType, validationStatus, avHet, avHetSE, func, locType, weight ); + } + + public void parseLine(final String[] parts) { + try { + String contig = parts[1]; + long start = Long.parseLong(parts[2]) + 1; // The final is 0 based + long stop = Long.parseLong(parts[3]) + 1; // The final is 0 based + loc = new GenomeLoc(contig, start, stop); + + name = parts[4]; + refBases = parts[5]; + strand = parts[6]; + observed = parts[9]; + molType = parts[10]; + varType = parts[11]; + validationStatus = parts[12]; + avHet = Double.parseDouble(parts[13]); + avHetSE = Double.parseDouble(parts[14]); + func = parts[15]; + locType = parts[16]; + weight = Integer.parseInt(parts[17]); + } catch ( RuntimeException e ) { + System.out.printf(" Exception caught during parsing GFFLine %s%n", Utils.join(" <=> ", parts)); + throw e; + } + } +} diff --git a/playground/java/src/org/broadinstitute/sting/gatk/refdata/rodGFF.java b/playground/java/src/org/broadinstitute/sting/gatk/refdata/rodGFF.java new file mode 100644 index 000000000..a82911b57 --- /dev/null +++ b/playground/java/src/org/broadinstitute/sting/gatk/refdata/rodGFF.java @@ -0,0 +1,113 @@ +package org.broadinstitute.sting.gatk.refdata; + +import java.util.HashMap; + +import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum; +import org.broadinstitute.sting.utils.GenomeLoc; + +/** + * Class for representing arbitrary reference ordered data sets + * + * User: mdepristo + * Date: Feb 27, 2009 + * Time: 10:47:14 AM + * To change this template use File | Settings | File Templates. + */ +public class rodGFF extends ReferenceOrderedDatum { + private String contig, source, feature, strand, frame; + private long start, stop; + private double score; + private HashMap attributes; + + // ---------------------------------------------------------------------- + // + // Constructors + // + // ---------------------------------------------------------------------- + public rodGFF() { + + } + + public void setValues(final String contig, final String source, final String feature, + final long start, final long stop, final double score, + final String strand, final String frame, HashMap attributes) { + this.contig = contig; + this.source = source; + this.feature = feature; + this.start = start; + this.stop= stop; + this.score = score; + this.strand = strand; + this.frame = frame; + this.attributes = attributes; + } + + // ---------------------------------------------------------------------- + // + // Accessors + // + // ---------------------------------------------------------------------- + public String getSource() { + return source; + } + + public String getFeature() { + return feature; + } + + public String getStrand() { + return strand; + } + + public String getFrame() { + return frame; + } + + public double getScore() { + return score; + } + + public GenomeLoc getLocation() { + return new GenomeLoc(contig, start, stop); + } + + public String getAttribute(final String key) { + return attributes.get(key); + } + + // ---------------------------------------------------------------------- + // + // formatting + // + // ---------------------------------------------------------------------- + public String toString() { + return String.format("%s\t%s\t%s\t%d\t%d\t%f\t%s\t%s", contig, source, feature, start, stop, score, strand, frame); + } + + public String repl() { + return this.toString(); + } + + public String toSimpleString() { + return String.format("%s", feature); + } + + public void parseLine(final String[] parts) { + //System.out.printf("Parsing GFFLine %s%n", Utils.join(" ", parts)); + + final String contig = parts[0]; + final String source = parts[1]; + final String feature = parts[2]; + final long start = Long.parseLong(parts[3]); + final long stop = Long.parseLong(parts[4]); + + double score = Double.NaN; + if ( ! parts[5].equals(".") ) + score = Double.parseDouble(parts[5]); + + final String strand = parts[6]; + final String frame = parts[7]; + HashMap attributes = null; + setValues(contig, source, feature, start, stop, score, strand, frame, attributes); + } +}