From 9ecb3e00159a129156e0cbdfeb1b731b9e40cd6c Mon Sep 17 00:00:00 2001 From: aaron Date: Wed, 15 Jul 2009 15:30:19 +0000 Subject: [PATCH] adding GLFRods with tests and some other code changes git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1251 348d0f76-0448-11de-a6fe-93d51630548a --- .../sting/gatk/refdata/RodGLF.java | 127 ++++++++++++++++++ .../gatk/walkers/SingleSampleGenotyper.java | 4 + .../sting/utils/genotype/glf/GLFReader.java | 50 +++++-- .../sting/utils/genotype/glf/GLFRecord.java | 23 +++- .../utils/genotype/glf/SinglePointCall.java | 7 +- .../genotype/glf/VariableLengthCall.java | 31 ++++- .../sting/gatk/refdata/RodGLFTest.java | 107 +++++++++++++++ .../utils/genotype/glf/GLFReaderTest.java | 51 +++++++ 8 files changed, 385 insertions(+), 15 deletions(-) create mode 100644 java/src/org/broadinstitute/sting/gatk/refdata/RodGLF.java create mode 100644 java/test/org/broadinstitute/sting/gatk/refdata/RodGLFTest.java create mode 100644 java/test/org/broadinstitute/sting/utils/genotype/glf/GLFReaderTest.java diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/RodGLF.java b/java/src/org/broadinstitute/sting/gatk/refdata/RodGLF.java new file mode 100644 index 000000000..c9a031870 --- /dev/null +++ b/java/src/org/broadinstitute/sting/gatk/refdata/RodGLF.java @@ -0,0 +1,127 @@ +package org.broadinstitute.sting.gatk.refdata; + +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.genotype.glf.GLFReader; +import org.broadinstitute.sting.utils.genotype.glf.GLFRecord; +import org.broadinstitute.sting.utils.genotype.glf.SinglePointCall; +import org.broadinstitute.sting.utils.genotype.glf.VariableLengthCall; + +import java.io.File; +import java.io.IOException; +import java.util.Iterator; + + +/** + * @author aaron + *

+ * Class RodGLF + *

+ * the rod class for GLF data. + */ +public class RodGLF extends BasicReferenceOrderedDatum { + protected GLFRecord mRecord; + private GenomeLoc mLoc; + private static GLFRODIterator mWrap; + private String contig; + private int contigLength; + + public RodGLF(String name) { + super(name); + } + + @Override + public boolean parseLine(Object header, String[] parts) throws IOException { + return false; //To change body of implemented methods use File | Settings | File Templates. + } + + @Override + public String toString() { + final StringBuilder builder = new StringBuilder(); + builder.append(contig); + builder.append("\t"); + builder.append(mRecord.getOffset()); + builder.append("\t"); + builder.append(mRecord.getRefBase()); + builder.append("\t"); + builder.append(mRecord.getReadDepth()); + builder.append("\t"); + builder.append(mRecord.getRmsMapQ()); + builder.append("\t"); + builder.append(mRecord.getMinimumLikelihood()); + + if (mRecord.getRecordType() == GLFRecord.RECORD_TYPE.SINGLE) { + for (double d : ((SinglePointCall) mRecord).getLikelihoods()) { + builder.append("\t"); + builder.append(d); + } + } else if (mRecord.getRecordType() == GLFRecord.RECORD_TYPE.VARIABLE) { + VariableLengthCall call = (VariableLengthCall) mRecord; + builder.append("\t"); + builder.append(call.getLkHom1()); + builder.append("\t"); + builder.append(call.getLkHom2()); + builder.append("\t"); + builder.append(call.getLkHet()); + builder.append("\t"); + builder.append(call.getIndelLen1()); + builder.append("\t"); + builder.append(call.getIndelSeq1()); + builder.append("\t"); + builder.append(call.getIndelLen2()); + builder.append("\t"); + builder.append(call.getIndelSeq2()); + } + return builder.toString(); + } + + @Override + public GenomeLoc getLocation() { + return mLoc; + } + + public static Iterator createIterator(String name, File file) { + if (mWrap == null) { + return new RodGLF.GLFWrapper(name, file); + } + return mWrap; + } + + static void setGLFWrapper(GLFRODIterator iter) { + mWrap = iter; + } + + private static class GLFWrapper implements GLFRODIterator { + private GLFReader mReader; + private String mName; + + public GLFWrapper(String name, File file) { + mName = name; + mReader = new GLFReader(file); + } + + @Override + public boolean hasNext() { + return (mReader.hasNext()); + } + + @Override + public RodGLF next() { + RodGLF glf = new RodGLF(mName); + glf.mRecord = mReader.next(); + glf.mLoc = GenomeLocParser.createGenomeLoc(mReader.getReferenceName(), mReader.getCurrentLocation()); + glf.contig = mReader.getReferenceName(); + glf.contigLength = mReader.getReferenceLength(); + return glf; + } + + @Override + public void remove() { + throw new UnsupportedOperationException("GLF Rods don't support the remove() function"); + } + } +} + +// for testing +interface GLFRODIterator extends Iterator { +} diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/SingleSampleGenotyper.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/SingleSampleGenotyper.java index 8e69af716..6ac926182 100644 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/SingleSampleGenotyper.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/SingleSampleGenotyper.java @@ -325,7 +325,11 @@ public class SingleSampleGenotyper extends LocusWalker { // reference length private int referenceLength; + // the current location, keeping track of the offsets + private int currentLocation = 1; + + // we have this variable becuase there is no eof for glf's + private int lastRecordType = -1; + /** * create a glf reader * * @param readFrom the file to read from */ - GLFReader(File readFrom) { + public GLFReader(File readFrom) { try { inputBinaryCodec = new BinaryCodec(new DataInputStream(new BlockCompressedInputStream(readFrom))); } catch (IOException e) { @@ -146,11 +153,15 @@ public class GLFReader implements Iterator { @Override public GLFRecord next() { - short firstBase = inputBinaryCodec.readUByte(); + GLFRecord ret = nextRecord; + short firstBase = protectedByteReadForFile(); + if (firstBase == -1) return ret; + + // parse out the record type and reference base byte recordType = (byte) ((firstBase & 0x0f0) >> 4); char refBase = (char) (firstBase & 0x000f); + lastRecordType = recordType; - GLFRecord ret = nextRecord; if (recordType == 1) { nextRecord = generateSPC(refBase, inputBinaryCodec); } else if (recordType == 2) { @@ -159,14 +170,31 @@ public class GLFReader implements Iterator { if (advanceContig()) { return next(); } - nextRecord = null; + //nextRecord = null; } else { throw new StingException("Unkonwn GLF record type (type = " + recordType + ")"); } - + if (ret != null) currentLocation += ret.getOffset(); return ret; } + /** + * read a short, and if we see an exception only throw it if it's unexpected (not after a zero) + * @return a short + */ + private short protectedByteReadForFile() { + short st = -1; + try { + st = inputBinaryCodec.readUByte(); + } catch (RuntimeEOFException exp) { + nextRecord = null; + if (lastRecordType != 0) { + throw exp; // if the last record was a zero, this is an ok condition. Otherwise throw an exception + } + } + return st; + } + /** * advance to the next contig * @@ -181,22 +209,22 @@ public class GLFReader implements Iterator { // get the reference length - this may be a problem storing an unsigned int into a signed int. but screw it. referenceLength = (int) inputBinaryCodec.readUInt(); //System.err.println(referenceName.length()); + currentLocation = 1; return true; } catch (RuntimeException e) { - e.printStackTrace(); - // we're out of file space, set the next to null + if (lastRecordType != 0) { + throw e; // if the last record was a zero, this is an ok condition. Otherwise throw an exception + } nextRecord = null; } return false; } - @Override public void remove() { throw new StingException("GLFReader doesn't support remove()"); } - /** * getter methods */ @@ -212,4 +240,8 @@ public class GLFReader implements Iterator { public String getHeaderStr() { return headerStr; } + + public int getCurrentLocation() { + return currentLocation; + } } diff --git a/java/src/org/broadinstitute/sting/utils/genotype/glf/GLFRecord.java b/java/src/org/broadinstitute/sting/utils/genotype/glf/GLFRecord.java index f107286aa..94017c8a7 100755 --- a/java/src/org/broadinstitute/sting/utils/genotype/glf/GLFRecord.java +++ b/java/src/org/broadinstitute/sting/utils/genotype/glf/GLFRecord.java @@ -39,7 +39,7 @@ import org.broadinstitute.sting.utils.StingException; * which represents the accepted FASTA nucleotide symbols and their assocated GLF * field values. */ -abstract class GLFRecord { +public abstract class GLFRecord { // fields common to all records protected REF_BASE refBase; @@ -110,7 +110,7 @@ abstract class GLFRecord { } /** the record type enum, which enumerates the different records we can have in a GLF */ - enum RECORD_TYPE { + public enum RECORD_TYPE { SINGLE((short) 1), VARIABLE((short) 2); @@ -243,5 +243,24 @@ abstract class GLFRecord { return (min > 255) ? 255 : (short)min; } + public REF_BASE getRefBase() { + return refBase; + } + + public long getOffset() { + return offset; + } + + public short getMinimumLikelihood() { + return minimumLikelihood; + } + + public int getReadDepth() { + return readDepth; + } + + public short getRmsMapQ() { + return rmsMapQ; + } } diff --git a/java/src/org/broadinstitute/sting/utils/genotype/glf/SinglePointCall.java b/java/src/org/broadinstitute/sting/utils/genotype/glf/SinglePointCall.java index a8fb075c5..03d9d34db 100644 --- a/java/src/org/broadinstitute/sting/utils/genotype/glf/SinglePointCall.java +++ b/java/src/org/broadinstitute/sting/utils/genotype/glf/SinglePointCall.java @@ -35,7 +35,7 @@ import net.sf.samtools.util.BinaryCodec; *

* This class represents a single point geneotype call in GLF vernacular */ -class SinglePointCall extends GLFRecord { +public class SinglePointCall extends GLFRecord { // our likelihoods object private double likelihoods[]; @@ -49,7 +49,7 @@ class SinglePointCall extends GLFRecord { * @param rmsMapQ the root mean square of the mapping quality * @param likelihoods the Likelihoods */ - SinglePointCall(char refBase, int offset, int readDepth, short rmsMapQ, double likelihoods[]) { + public SinglePointCall(char refBase, int offset, int readDepth, short rmsMapQ, double likelihoods[]) { super(refBase, offset, (short) GLFRecord.findMin(likelihoods), readDepth, rmsMapQ); this.likelihoods = likelihoods; } @@ -89,4 +89,7 @@ class SinglePointCall extends GLFRecord { return likelihoods.length + super.getByteSize(); } + public double[] getLikelihoods() { + return likelihoods; + } } diff --git a/java/src/org/broadinstitute/sting/utils/genotype/glf/VariableLengthCall.java b/java/src/org/broadinstitute/sting/utils/genotype/glf/VariableLengthCall.java index 57f6d3606..b2109cca4 100644 --- a/java/src/org/broadinstitute/sting/utils/genotype/glf/VariableLengthCall.java +++ b/java/src/org/broadinstitute/sting/utils/genotype/glf/VariableLengthCall.java @@ -36,7 +36,7 @@ import net.sf.samtools.util.BinaryCodec; * Currently a lot of parameters need to be provided, but we may be able to thin * those down as we understand what we have to specify and what we can infer. */ -class VariableLengthCall extends GLFRecord { +public class VariableLengthCall extends GLFRecord { // our fields, corresponding to the glf spec private short lkHom1 = 0; private short lkHom2 = 0; @@ -63,7 +63,7 @@ class VariableLengthCall extends GLFRecord { * @param indelSeq1 the sequence for the first indel allele * @param indelSeq2 the sequence for the second indel allele */ - VariableLengthCall(char refBase, + public VariableLengthCall(char refBase, long offset, int readDepth, short rmsMapQ, @@ -116,4 +116,31 @@ class VariableLengthCall extends GLFRecord { return size + super.getByteSize(); } + public short getLkHom1() { + return lkHom1; + } + + public short getLkHom2() { + return lkHom2; + } + + public short getLkHet() { + return lkHet; + } + + public short[] getIndelSeq1() { + return indelSeq1; + } + + public short[] getIndelSeq2() { + return indelSeq2; + } + + public int getIndelLen2() { + return indelLen2; + } + + public int getIndelLen1() { + return indelLen1; + } } diff --git a/java/test/org/broadinstitute/sting/gatk/refdata/RodGLFTest.java b/java/test/org/broadinstitute/sting/gatk/refdata/RodGLFTest.java new file mode 100644 index 000000000..fd8b2e1c7 --- /dev/null +++ b/java/test/org/broadinstitute/sting/gatk/refdata/RodGLFTest.java @@ -0,0 +1,107 @@ +package org.broadinstitute.sting.gatk.refdata; + +import net.sf.picard.reference.ReferenceSequenceFile; +import net.sf.picard.reference.ReferenceSequenceFileFactory; +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.genotype.glf.SinglePointCall; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.fail; +import org.junit.BeforeClass; +import org.junit.Test; + +import java.io.File; +import java.util.Iterator; + +/** + * Created by IntelliJ IDEA. + * User: aaronmckenna + * Date: Jul 15, 2009 + * Time: 12:18:50 AM + */ +public class RodGLFTest extends BaseTest { + static final File glfFile = new File("/humgen/gsa-scr1/GATK_Data/Validation_Data/index_test_likelihoods.glf"); + static final int finalRecordCount = 484140; // the number of records in the above file + static final int contigCount = 25; + static final String ref = "/seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta"; + + @BeforeClass + public static void before() { + ReferenceSequenceFile r = ReferenceSequenceFileFactory.getReferenceSequenceFile(new File(ref)); + GenomeLocParser.setupRefContigOrdering(r); + } + + static class TestRODClass implements GLFRODIterator { + private int count = 0; + private int readCount = 0; + + public TestRODClass(int count) { + this.count = count; + } + + @Override + public boolean hasNext() { + if (count <= 0) { + return false; + } + return true; + } + + @Override + public RodGLF next() { + RodGLF glf = new RodGLF("Test"); + glf.mRecord = new SinglePointCall('A', readCount, 0, (short) 0, new double[]{0.0, 0.0}); + --count; + readCount++; + return glf; + } + + @Override + public void remove() { + throw new UnsupportedOperationException("BOO"); + } + } + + //@Test + public void testNext() { + RodGLF.setGLFWrapper(new RodGLFTest.TestRODClass(10)); + Iterator iter = RodGLF.createIterator("test", new File("")); + int counter = 0; + while (iter.hasNext()) { + iter.next(); + counter++; + } + assertEquals(10, counter); + } + + @Test + public void testRodCount() { + + Iterator iter = RodGLF.createIterator("test", glfFile); + int counter = 0; + while (iter.hasNext()) { + RodGLF glf = iter.next(); + counter++; + } + assertEquals(finalRecordCount, counter); + } + + @Test + public void testRodLocations() { + + Iterator iter = RodGLF.createIterator("test", glfFile); + GenomeLoc loc = null; + while (iter.hasNext()) { + RodGLF glf = iter.next(); + if (loc != null) { + if (glf.getLocation().isBefore(loc)) { + fail("locations in the GLF came out of order loc = " + loc.toString() + " new loc = " + glf.getLocation().toString()); + } + } + loc = glf.getLocation(); + } + } + + +} diff --git a/java/test/org/broadinstitute/sting/utils/genotype/glf/GLFReaderTest.java b/java/test/org/broadinstitute/sting/utils/genotype/glf/GLFReaderTest.java new file mode 100644 index 000000000..b0dd75c14 --- /dev/null +++ b/java/test/org/broadinstitute/sting/utils/genotype/glf/GLFReaderTest.java @@ -0,0 +1,51 @@ +package org.broadinstitute.sting.utils.genotype.glf; + +import org.broadinstitute.sting.BaseTest; +import org.junit.Assert; +import org.junit.Test; + +import java.io.File; +import java.util.ArrayList; +import java.util.List; + + +/** + * @author aaron + *

+ * Class GLFReaderTest + *

+ * A descriptions should go here. Blame aaron if it's missing. + */ +public class GLFReaderTest extends BaseTest { + + + // our test file + static final File glfFile = new File("/humgen/gsa-scr1/GATK_Data/Validation_Data/index_test_likelihoods.glf"); + static final int finalRecordCount = 484140; // the number of records in the above file + static final int contigCount = 25; + + /** read in the records from the file */ + @Test + public void testReadRecords() { + int recCount = 0; + List contigs = new ArrayList(); + try { + GLFReader reader = new GLFReader(glfFile); + long location = 1; + while (reader.hasNext()) { + GLFRecord rec = reader.next(); + if (!contigs.contains(reader.getReferenceName())) { + contigs.add(reader.getReferenceName()); + } + location = location + rec.offset; + //System.err.println("Record count = " + finalRecordCount + " offset " + rec.offset + " location = " + location + " type = " + rec.getRecordType()); + ++recCount; + } + } catch (Exception e) { + System.err.println("Record count = " + recCount); + e.printStackTrace(); + } + Assert.assertEquals(finalRecordCount, recCount); + Assert.assertEquals(contigCount, contigs.size()); + } +}