adding GLFRods with tests and some other code changes

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1251 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
aaron 2009-07-15 15:30:19 +00:00
parent c25f84a01c
commit 9ecb3e0015
8 changed files with 385 additions and 15 deletions

View File

@ -0,0 +1,127 @@
package org.broadinstitute.sting.gatk.refdata;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.genotype.glf.GLFReader;
import org.broadinstitute.sting.utils.genotype.glf.GLFRecord;
import org.broadinstitute.sting.utils.genotype.glf.SinglePointCall;
import org.broadinstitute.sting.utils.genotype.glf.VariableLengthCall;
import java.io.File;
import java.io.IOException;
import java.util.Iterator;
/**
* @author aaron
* <p/>
* Class RodGLF
* <p/>
* the rod class for GLF data.
*/
public class RodGLF extends BasicReferenceOrderedDatum {
protected GLFRecord mRecord;
private GenomeLoc mLoc;
private static GLFRODIterator mWrap;
private String contig;
private int contigLength;
public RodGLF(String name) {
super(name);
}
@Override
public boolean parseLine(Object header, String[] parts) throws IOException {
return false; //To change body of implemented methods use File | Settings | File Templates.
}
@Override
public String toString() {
final StringBuilder builder = new StringBuilder();
builder.append(contig);
builder.append("\t");
builder.append(mRecord.getOffset());
builder.append("\t");
builder.append(mRecord.getRefBase());
builder.append("\t");
builder.append(mRecord.getReadDepth());
builder.append("\t");
builder.append(mRecord.getRmsMapQ());
builder.append("\t");
builder.append(mRecord.getMinimumLikelihood());
if (mRecord.getRecordType() == GLFRecord.RECORD_TYPE.SINGLE) {
for (double d : ((SinglePointCall) mRecord).getLikelihoods()) {
builder.append("\t");
builder.append(d);
}
} else if (mRecord.getRecordType() == GLFRecord.RECORD_TYPE.VARIABLE) {
VariableLengthCall call = (VariableLengthCall) mRecord;
builder.append("\t");
builder.append(call.getLkHom1());
builder.append("\t");
builder.append(call.getLkHom2());
builder.append("\t");
builder.append(call.getLkHet());
builder.append("\t");
builder.append(call.getIndelLen1());
builder.append("\t");
builder.append(call.getIndelSeq1());
builder.append("\t");
builder.append(call.getIndelLen2());
builder.append("\t");
builder.append(call.getIndelSeq2());
}
return builder.toString();
}
@Override
public GenomeLoc getLocation() {
return mLoc;
}
public static Iterator<RodGLF> createIterator(String name, File file) {
if (mWrap == null) {
return new RodGLF.GLFWrapper(name, file);
}
return mWrap;
}
static void setGLFWrapper(GLFRODIterator iter) {
mWrap = iter;
}
private static class GLFWrapper implements GLFRODIterator {
private GLFReader mReader;
private String mName;
public GLFWrapper(String name, File file) {
mName = name;
mReader = new GLFReader(file);
}
@Override
public boolean hasNext() {
return (mReader.hasNext());
}
@Override
public RodGLF next() {
RodGLF glf = new RodGLF(mName);
glf.mRecord = mReader.next();
glf.mLoc = GenomeLocParser.createGenomeLoc(mReader.getReferenceName(), mReader.getCurrentLocation());
glf.contig = mReader.getReferenceName();
glf.contigLength = mReader.getReferenceLength();
return glf;
}
@Override
public void remove() {
throw new UnsupportedOperationException("GLF Rods don't support the remove() function");
}
}
}
// for testing
interface GLFRODIterator extends Iterator<RodGLF> {
}

View File

@ -325,7 +325,11 @@ public class SingleSampleGenotyper extends LocusWalker<AlleleFrequencyEstimate,
variantsOut.println(alleleFreq.asTabularString());
} else if (this.VAR_FORMAT == GenotypeWriterFactory.GENOTYPE_FORMAT.GLF) {
SAMSequenceRecord rec = GenomeLocParser.getContigInfo(alleleFreq.location.getContig());
for (int x = 0; x < alleleFreq.posteriors.length; x++) {
alleleFreq.posteriors[x] *= 10;
}
LikelihoodObject obj = new LikelihoodObject(alleleFreq.posteriors, LikelihoodObject.LIKELIHOOD_TYPE.LOG);
obj.setLikelihoodType(LikelihoodObject.LIKELIHOOD_TYPE.NEGITIVE_LOG);
this.mGenotypeWriter.addGenotypeCall(rec,(int)alleleFreq.location.getStart(),0.0f,alleleFreq.ref,alleleFreq.depth,obj);
}

View File

@ -2,6 +2,7 @@ package org.broadinstitute.sting.utils.genotype.glf;
import net.sf.samtools.util.BinaryCodec;
import net.sf.samtools.util.BlockCompressedInputStream;
import net.sf.samtools.util.RuntimeEOFException;
import org.broadinstitute.sting.utils.StingException;
import org.broadinstitute.sting.utils.genotype.LikelihoodObject;
@ -56,12 +57,18 @@ public class GLFReader implements Iterator<GLFRecord> {
// reference length
private int referenceLength;
// the current location, keeping track of the offsets
private int currentLocation = 1;
// we have this variable becuase there is no eof for glf's
private int lastRecordType = -1;
/**
* create a glf reader
*
* @param readFrom the file to read from
*/
GLFReader(File readFrom) {
public GLFReader(File readFrom) {
try {
inputBinaryCodec = new BinaryCodec(new DataInputStream(new BlockCompressedInputStream(readFrom)));
} catch (IOException e) {
@ -146,11 +153,15 @@ public class GLFReader implements Iterator<GLFRecord> {
@Override
public GLFRecord next() {
short firstBase = inputBinaryCodec.readUByte();
GLFRecord ret = nextRecord;
short firstBase = protectedByteReadForFile();
if (firstBase == -1) return ret;
// parse out the record type and reference base
byte recordType = (byte) ((firstBase & 0x0f0) >> 4);
char refBase = (char) (firstBase & 0x000f);
lastRecordType = recordType;
GLFRecord ret = nextRecord;
if (recordType == 1) {
nextRecord = generateSPC(refBase, inputBinaryCodec);
} else if (recordType == 2) {
@ -159,14 +170,31 @@ public class GLFReader implements Iterator<GLFRecord> {
if (advanceContig()) {
return next();
}
nextRecord = null;
//nextRecord = null;
} else {
throw new StingException("Unkonwn GLF record type (type = " + recordType + ")");
}
if (ret != null) currentLocation += ret.getOffset();
return ret;
}
/**
* read a short, and if we see an exception only throw it if it's unexpected (not after a zero)
* @return a short
*/
private short protectedByteReadForFile() {
short st = -1;
try {
st = inputBinaryCodec.readUByte();
} catch (RuntimeEOFException exp) {
nextRecord = null;
if (lastRecordType != 0) {
throw exp; // if the last record was a zero, this is an ok condition. Otherwise throw an exception
}
}
return st;
}
/**
* advance to the next contig
*
@ -181,22 +209,22 @@ public class GLFReader implements Iterator<GLFRecord> {
// get the reference length - this may be a problem storing an unsigned int into a signed int. but screw it.
referenceLength = (int) inputBinaryCodec.readUInt();
//System.err.println(referenceName.length());
currentLocation = 1;
return true;
} catch (RuntimeException e) {
e.printStackTrace();
// we're out of file space, set the next to null
if (lastRecordType != 0) {
throw e; // if the last record was a zero, this is an ok condition. Otherwise throw an exception
}
nextRecord = null;
}
return false;
}
@Override
public void remove() {
throw new StingException("GLFReader doesn't support remove()");
}
/**
* getter methods
*/
@ -212,4 +240,8 @@ public class GLFReader implements Iterator<GLFRecord> {
public String getHeaderStr() {
return headerStr;
}
public int getCurrentLocation() {
return currentLocation;
}
}

View File

@ -39,7 +39,7 @@ import org.broadinstitute.sting.utils.StingException;
* which represents the accepted FASTA nucleotide symbols and their assocated GLF
* field values.
*/
abstract class GLFRecord {
public abstract class GLFRecord {
// fields common to all records
protected REF_BASE refBase;
@ -110,7 +110,7 @@ abstract class GLFRecord {
}
/** the record type enum, which enumerates the different records we can have in a GLF */
enum RECORD_TYPE {
public enum RECORD_TYPE {
SINGLE((short) 1),
VARIABLE((short) 2);
@ -243,5 +243,24 @@ abstract class GLFRecord {
return (min > 255) ? 255 : (short)min;
}
public REF_BASE getRefBase() {
return refBase;
}
public long getOffset() {
return offset;
}
public short getMinimumLikelihood() {
return minimumLikelihood;
}
public int getReadDepth() {
return readDepth;
}
public short getRmsMapQ() {
return rmsMapQ;
}
}

View File

@ -35,7 +35,7 @@ import net.sf.samtools.util.BinaryCodec;
* <p/>
* This class represents a single point geneotype call in GLF vernacular
*/
class SinglePointCall extends GLFRecord {
public class SinglePointCall extends GLFRecord {
// our likelihoods object
private double likelihoods[];
@ -49,7 +49,7 @@ class SinglePointCall extends GLFRecord {
* @param rmsMapQ the root mean square of the mapping quality
* @param likelihoods the Likelihoods
*/
SinglePointCall(char refBase, int offset, int readDepth, short rmsMapQ, double likelihoods[]) {
public SinglePointCall(char refBase, int offset, int readDepth, short rmsMapQ, double likelihoods[]) {
super(refBase, offset, (short) GLFRecord.findMin(likelihoods), readDepth, rmsMapQ);
this.likelihoods = likelihoods;
}
@ -89,4 +89,7 @@ class SinglePointCall extends GLFRecord {
return likelihoods.length + super.getByteSize();
}
public double[] getLikelihoods() {
return likelihoods;
}
}

View File

@ -36,7 +36,7 @@ import net.sf.samtools.util.BinaryCodec;
* Currently a lot of parameters need to be provided, but we may be able to thin
* those down as we understand what we have to specify and what we can infer.
*/
class VariableLengthCall extends GLFRecord {
public class VariableLengthCall extends GLFRecord {
// our fields, corresponding to the glf spec
private short lkHom1 = 0;
private short lkHom2 = 0;
@ -63,7 +63,7 @@ class VariableLengthCall extends GLFRecord {
* @param indelSeq1 the sequence for the first indel allele
* @param indelSeq2 the sequence for the second indel allele
*/
VariableLengthCall(char refBase,
public VariableLengthCall(char refBase,
long offset,
int readDepth,
short rmsMapQ,
@ -116,4 +116,31 @@ class VariableLengthCall extends GLFRecord {
return size + super.getByteSize();
}
public short getLkHom1() {
return lkHom1;
}
public short getLkHom2() {
return lkHom2;
}
public short getLkHet() {
return lkHet;
}
public short[] getIndelSeq1() {
return indelSeq1;
}
public short[] getIndelSeq2() {
return indelSeq2;
}
public int getIndelLen2() {
return indelLen2;
}
public int getIndelLen1() {
return indelLen1;
}
}

View File

@ -0,0 +1,107 @@
package org.broadinstitute.sting.gatk.refdata;
import net.sf.picard.reference.ReferenceSequenceFile;
import net.sf.picard.reference.ReferenceSequenceFileFactory;
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.genotype.glf.SinglePointCall;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.fail;
import org.junit.BeforeClass;
import org.junit.Test;
import java.io.File;
import java.util.Iterator;
/**
* Created by IntelliJ IDEA.
* User: aaronmckenna
* Date: Jul 15, 2009
* Time: 12:18:50 AM
*/
public class RodGLFTest extends BaseTest {
static final File glfFile = new File("/humgen/gsa-scr1/GATK_Data/Validation_Data/index_test_likelihoods.glf");
static final int finalRecordCount = 484140; // the number of records in the above file
static final int contigCount = 25;
static final String ref = "/seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta";
@BeforeClass
public static void before() {
ReferenceSequenceFile r = ReferenceSequenceFileFactory.getReferenceSequenceFile(new File(ref));
GenomeLocParser.setupRefContigOrdering(r);
}
static class TestRODClass implements GLFRODIterator {
private int count = 0;
private int readCount = 0;
public TestRODClass(int count) {
this.count = count;
}
@Override
public boolean hasNext() {
if (count <= 0) {
return false;
}
return true;
}
@Override
public RodGLF next() {
RodGLF glf = new RodGLF("Test");
glf.mRecord = new SinglePointCall('A', readCount, 0, (short) 0, new double[]{0.0, 0.0});
--count;
readCount++;
return glf;
}
@Override
public void remove() {
throw new UnsupportedOperationException("BOO");
}
}
//@Test
public void testNext() {
RodGLF.setGLFWrapper(new RodGLFTest.TestRODClass(10));
Iterator<RodGLF> iter = RodGLF.createIterator("test", new File(""));
int counter = 0;
while (iter.hasNext()) {
iter.next();
counter++;
}
assertEquals(10, counter);
}
@Test
public void testRodCount() {
Iterator<RodGLF> iter = RodGLF.createIterator("test", glfFile);
int counter = 0;
while (iter.hasNext()) {
RodGLF glf = iter.next();
counter++;
}
assertEquals(finalRecordCount, counter);
}
@Test
public void testRodLocations() {
Iterator<RodGLF> iter = RodGLF.createIterator("test", glfFile);
GenomeLoc loc = null;
while (iter.hasNext()) {
RodGLF glf = iter.next();
if (loc != null) {
if (glf.getLocation().isBefore(loc)) {
fail("locations in the GLF came out of order loc = " + loc.toString() + " new loc = " + glf.getLocation().toString());
}
}
loc = glf.getLocation();
}
}
}

View File

@ -0,0 +1,51 @@
package org.broadinstitute.sting.utils.genotype.glf;
import org.broadinstitute.sting.BaseTest;
import org.junit.Assert;
import org.junit.Test;
import java.io.File;
import java.util.ArrayList;
import java.util.List;
/**
* @author aaron
* <p/>
* Class GLFReaderTest
* <p/>
* A descriptions should go here. Blame aaron if it's missing.
*/
public class GLFReaderTest extends BaseTest {
// our test file
static final File glfFile = new File("/humgen/gsa-scr1/GATK_Data/Validation_Data/index_test_likelihoods.glf");
static final int finalRecordCount = 484140; // the number of records in the above file
static final int contigCount = 25;
/** read in the records from the file */
@Test
public void testReadRecords() {
int recCount = 0;
List<String> contigs = new ArrayList<String>();
try {
GLFReader reader = new GLFReader(glfFile);
long location = 1;
while (reader.hasNext()) {
GLFRecord rec = reader.next();
if (!contigs.contains(reader.getReferenceName())) {
contigs.add(reader.getReferenceName());
}
location = location + rec.offset;
//System.err.println("Record count = " + finalRecordCount + " offset " + rec.offset + " location = " + location + " type = " + rec.getRecordType());
++recCount;
}
} catch (Exception e) {
System.err.println("Record count = " + recCount);
e.printStackTrace();
}
Assert.assertEquals(finalRecordCount, recCount);
Assert.assertEquals(contigCount, contigs.size());
}
}