adding GLFRods with tests and some other code changes
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1251 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
c25f84a01c
commit
9ecb3e0015
|
|
@ -0,0 +1,127 @@
|
|||
package org.broadinstitute.sting.gatk.refdata;
|
||||
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||
import org.broadinstitute.sting.utils.genotype.glf.GLFReader;
|
||||
import org.broadinstitute.sting.utils.genotype.glf.GLFRecord;
|
||||
import org.broadinstitute.sting.utils.genotype.glf.SinglePointCall;
|
||||
import org.broadinstitute.sting.utils.genotype.glf.VariableLengthCall;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.util.Iterator;
|
||||
|
||||
|
||||
/**
|
||||
* @author aaron
|
||||
* <p/>
|
||||
* Class RodGLF
|
||||
* <p/>
|
||||
* the rod class for GLF data.
|
||||
*/
|
||||
public class RodGLF extends BasicReferenceOrderedDatum {
|
||||
protected GLFRecord mRecord;
|
||||
private GenomeLoc mLoc;
|
||||
private static GLFRODIterator mWrap;
|
||||
private String contig;
|
||||
private int contigLength;
|
||||
|
||||
public RodGLF(String name) {
|
||||
super(name);
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean parseLine(Object header, String[] parts) throws IOException {
|
||||
return false; //To change body of implemented methods use File | Settings | File Templates.
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
final StringBuilder builder = new StringBuilder();
|
||||
builder.append(contig);
|
||||
builder.append("\t");
|
||||
builder.append(mRecord.getOffset());
|
||||
builder.append("\t");
|
||||
builder.append(mRecord.getRefBase());
|
||||
builder.append("\t");
|
||||
builder.append(mRecord.getReadDepth());
|
||||
builder.append("\t");
|
||||
builder.append(mRecord.getRmsMapQ());
|
||||
builder.append("\t");
|
||||
builder.append(mRecord.getMinimumLikelihood());
|
||||
|
||||
if (mRecord.getRecordType() == GLFRecord.RECORD_TYPE.SINGLE) {
|
||||
for (double d : ((SinglePointCall) mRecord).getLikelihoods()) {
|
||||
builder.append("\t");
|
||||
builder.append(d);
|
||||
}
|
||||
} else if (mRecord.getRecordType() == GLFRecord.RECORD_TYPE.VARIABLE) {
|
||||
VariableLengthCall call = (VariableLengthCall) mRecord;
|
||||
builder.append("\t");
|
||||
builder.append(call.getLkHom1());
|
||||
builder.append("\t");
|
||||
builder.append(call.getLkHom2());
|
||||
builder.append("\t");
|
||||
builder.append(call.getLkHet());
|
||||
builder.append("\t");
|
||||
builder.append(call.getIndelLen1());
|
||||
builder.append("\t");
|
||||
builder.append(call.getIndelSeq1());
|
||||
builder.append("\t");
|
||||
builder.append(call.getIndelLen2());
|
||||
builder.append("\t");
|
||||
builder.append(call.getIndelSeq2());
|
||||
}
|
||||
return builder.toString();
|
||||
}
|
||||
|
||||
@Override
|
||||
public GenomeLoc getLocation() {
|
||||
return mLoc;
|
||||
}
|
||||
|
||||
public static Iterator<RodGLF> createIterator(String name, File file) {
|
||||
if (mWrap == null) {
|
||||
return new RodGLF.GLFWrapper(name, file);
|
||||
}
|
||||
return mWrap;
|
||||
}
|
||||
|
||||
static void setGLFWrapper(GLFRODIterator iter) {
|
||||
mWrap = iter;
|
||||
}
|
||||
|
||||
private static class GLFWrapper implements GLFRODIterator {
|
||||
private GLFReader mReader;
|
||||
private String mName;
|
||||
|
||||
public GLFWrapper(String name, File file) {
|
||||
mName = name;
|
||||
mReader = new GLFReader(file);
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean hasNext() {
|
||||
return (mReader.hasNext());
|
||||
}
|
||||
|
||||
@Override
|
||||
public RodGLF next() {
|
||||
RodGLF glf = new RodGLF(mName);
|
||||
glf.mRecord = mReader.next();
|
||||
glf.mLoc = GenomeLocParser.createGenomeLoc(mReader.getReferenceName(), mReader.getCurrentLocation());
|
||||
glf.contig = mReader.getReferenceName();
|
||||
glf.contigLength = mReader.getReferenceLength();
|
||||
return glf;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void remove() {
|
||||
throw new UnsupportedOperationException("GLF Rods don't support the remove() function");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// for testing
|
||||
interface GLFRODIterator extends Iterator<RodGLF> {
|
||||
}
|
||||
|
|
@ -325,7 +325,11 @@ public class SingleSampleGenotyper extends LocusWalker<AlleleFrequencyEstimate,
|
|||
variantsOut.println(alleleFreq.asTabularString());
|
||||
} else if (this.VAR_FORMAT == GenotypeWriterFactory.GENOTYPE_FORMAT.GLF) {
|
||||
SAMSequenceRecord rec = GenomeLocParser.getContigInfo(alleleFreq.location.getContig());
|
||||
for (int x = 0; x < alleleFreq.posteriors.length; x++) {
|
||||
alleleFreq.posteriors[x] *= 10;
|
||||
}
|
||||
LikelihoodObject obj = new LikelihoodObject(alleleFreq.posteriors, LikelihoodObject.LIKELIHOOD_TYPE.LOG);
|
||||
|
||||
obj.setLikelihoodType(LikelihoodObject.LIKELIHOOD_TYPE.NEGITIVE_LOG);
|
||||
this.mGenotypeWriter.addGenotypeCall(rec,(int)alleleFreq.location.getStart(),0.0f,alleleFreq.ref,alleleFreq.depth,obj);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -2,6 +2,7 @@ package org.broadinstitute.sting.utils.genotype.glf;
|
|||
|
||||
import net.sf.samtools.util.BinaryCodec;
|
||||
import net.sf.samtools.util.BlockCompressedInputStream;
|
||||
import net.sf.samtools.util.RuntimeEOFException;
|
||||
import org.broadinstitute.sting.utils.StingException;
|
||||
import org.broadinstitute.sting.utils.genotype.LikelihoodObject;
|
||||
|
||||
|
|
@ -56,12 +57,18 @@ public class GLFReader implements Iterator<GLFRecord> {
|
|||
// reference length
|
||||
private int referenceLength;
|
||||
|
||||
// the current location, keeping track of the offsets
|
||||
private int currentLocation = 1;
|
||||
|
||||
// we have this variable becuase there is no eof for glf's
|
||||
private int lastRecordType = -1;
|
||||
|
||||
/**
|
||||
* create a glf reader
|
||||
*
|
||||
* @param readFrom the file to read from
|
||||
*/
|
||||
GLFReader(File readFrom) {
|
||||
public GLFReader(File readFrom) {
|
||||
try {
|
||||
inputBinaryCodec = new BinaryCodec(new DataInputStream(new BlockCompressedInputStream(readFrom)));
|
||||
} catch (IOException e) {
|
||||
|
|
@ -146,11 +153,15 @@ public class GLFReader implements Iterator<GLFRecord> {
|
|||
|
||||
@Override
|
||||
public GLFRecord next() {
|
||||
short firstBase = inputBinaryCodec.readUByte();
|
||||
GLFRecord ret = nextRecord;
|
||||
short firstBase = protectedByteReadForFile();
|
||||
if (firstBase == -1) return ret;
|
||||
|
||||
// parse out the record type and reference base
|
||||
byte recordType = (byte) ((firstBase & 0x0f0) >> 4);
|
||||
char refBase = (char) (firstBase & 0x000f);
|
||||
lastRecordType = recordType;
|
||||
|
||||
GLFRecord ret = nextRecord;
|
||||
if (recordType == 1) {
|
||||
nextRecord = generateSPC(refBase, inputBinaryCodec);
|
||||
} else if (recordType == 2) {
|
||||
|
|
@ -159,14 +170,31 @@ public class GLFReader implements Iterator<GLFRecord> {
|
|||
if (advanceContig()) {
|
||||
return next();
|
||||
}
|
||||
nextRecord = null;
|
||||
//nextRecord = null;
|
||||
} else {
|
||||
throw new StingException("Unkonwn GLF record type (type = " + recordType + ")");
|
||||
}
|
||||
|
||||
if (ret != null) currentLocation += ret.getOffset();
|
||||
return ret;
|
||||
}
|
||||
|
||||
/**
|
||||
* read a short, and if we see an exception only throw it if it's unexpected (not after a zero)
|
||||
* @return a short
|
||||
*/
|
||||
private short protectedByteReadForFile() {
|
||||
short st = -1;
|
||||
try {
|
||||
st = inputBinaryCodec.readUByte();
|
||||
} catch (RuntimeEOFException exp) {
|
||||
nextRecord = null;
|
||||
if (lastRecordType != 0) {
|
||||
throw exp; // if the last record was a zero, this is an ok condition. Otherwise throw an exception
|
||||
}
|
||||
}
|
||||
return st;
|
||||
}
|
||||
|
||||
/**
|
||||
* advance to the next contig
|
||||
*
|
||||
|
|
@ -181,22 +209,22 @@ public class GLFReader implements Iterator<GLFRecord> {
|
|||
// get the reference length - this may be a problem storing an unsigned int into a signed int. but screw it.
|
||||
referenceLength = (int) inputBinaryCodec.readUInt();
|
||||
//System.err.println(referenceName.length());
|
||||
currentLocation = 1;
|
||||
return true;
|
||||
} catch (RuntimeException e) {
|
||||
e.printStackTrace();
|
||||
// we're out of file space, set the next to null
|
||||
if (lastRecordType != 0) {
|
||||
throw e; // if the last record was a zero, this is an ok condition. Otherwise throw an exception
|
||||
}
|
||||
nextRecord = null;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public void remove() {
|
||||
throw new StingException("GLFReader doesn't support remove()");
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* getter methods
|
||||
*/
|
||||
|
|
@ -212,4 +240,8 @@ public class GLFReader implements Iterator<GLFRecord> {
|
|||
public String getHeaderStr() {
|
||||
return headerStr;
|
||||
}
|
||||
|
||||
public int getCurrentLocation() {
|
||||
return currentLocation;
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -39,7 +39,7 @@ import org.broadinstitute.sting.utils.StingException;
|
|||
* which represents the accepted FASTA nucleotide symbols and their assocated GLF
|
||||
* field values.
|
||||
*/
|
||||
abstract class GLFRecord {
|
||||
public abstract class GLFRecord {
|
||||
|
||||
// fields common to all records
|
||||
protected REF_BASE refBase;
|
||||
|
|
@ -110,7 +110,7 @@ abstract class GLFRecord {
|
|||
}
|
||||
|
||||
/** the record type enum, which enumerates the different records we can have in a GLF */
|
||||
enum RECORD_TYPE {
|
||||
public enum RECORD_TYPE {
|
||||
SINGLE((short) 1),
|
||||
VARIABLE((short) 2);
|
||||
|
||||
|
|
@ -243,5 +243,24 @@ abstract class GLFRecord {
|
|||
return (min > 255) ? 255 : (short)min;
|
||||
}
|
||||
|
||||
public REF_BASE getRefBase() {
|
||||
return refBase;
|
||||
}
|
||||
|
||||
public long getOffset() {
|
||||
return offset;
|
||||
}
|
||||
|
||||
public short getMinimumLikelihood() {
|
||||
return minimumLikelihood;
|
||||
}
|
||||
|
||||
public int getReadDepth() {
|
||||
return readDepth;
|
||||
}
|
||||
|
||||
public short getRmsMapQ() {
|
||||
return rmsMapQ;
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -35,7 +35,7 @@ import net.sf.samtools.util.BinaryCodec;
|
|||
* <p/>
|
||||
* This class represents a single point geneotype call in GLF vernacular
|
||||
*/
|
||||
class SinglePointCall extends GLFRecord {
|
||||
public class SinglePointCall extends GLFRecord {
|
||||
|
||||
// our likelihoods object
|
||||
private double likelihoods[];
|
||||
|
|
@ -49,7 +49,7 @@ class SinglePointCall extends GLFRecord {
|
|||
* @param rmsMapQ the root mean square of the mapping quality
|
||||
* @param likelihoods the Likelihoods
|
||||
*/
|
||||
SinglePointCall(char refBase, int offset, int readDepth, short rmsMapQ, double likelihoods[]) {
|
||||
public SinglePointCall(char refBase, int offset, int readDepth, short rmsMapQ, double likelihoods[]) {
|
||||
super(refBase, offset, (short) GLFRecord.findMin(likelihoods), readDepth, rmsMapQ);
|
||||
this.likelihoods = likelihoods;
|
||||
}
|
||||
|
|
@ -89,4 +89,7 @@ class SinglePointCall extends GLFRecord {
|
|||
return likelihoods.length + super.getByteSize();
|
||||
}
|
||||
|
||||
public double[] getLikelihoods() {
|
||||
return likelihoods;
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -36,7 +36,7 @@ import net.sf.samtools.util.BinaryCodec;
|
|||
* Currently a lot of parameters need to be provided, but we may be able to thin
|
||||
* those down as we understand what we have to specify and what we can infer.
|
||||
*/
|
||||
class VariableLengthCall extends GLFRecord {
|
||||
public class VariableLengthCall extends GLFRecord {
|
||||
// our fields, corresponding to the glf spec
|
||||
private short lkHom1 = 0;
|
||||
private short lkHom2 = 0;
|
||||
|
|
@ -63,7 +63,7 @@ class VariableLengthCall extends GLFRecord {
|
|||
* @param indelSeq1 the sequence for the first indel allele
|
||||
* @param indelSeq2 the sequence for the second indel allele
|
||||
*/
|
||||
VariableLengthCall(char refBase,
|
||||
public VariableLengthCall(char refBase,
|
||||
long offset,
|
||||
int readDepth,
|
||||
short rmsMapQ,
|
||||
|
|
@ -116,4 +116,31 @@ class VariableLengthCall extends GLFRecord {
|
|||
return size + super.getByteSize();
|
||||
}
|
||||
|
||||
public short getLkHom1() {
|
||||
return lkHom1;
|
||||
}
|
||||
|
||||
public short getLkHom2() {
|
||||
return lkHom2;
|
||||
}
|
||||
|
||||
public short getLkHet() {
|
||||
return lkHet;
|
||||
}
|
||||
|
||||
public short[] getIndelSeq1() {
|
||||
return indelSeq1;
|
||||
}
|
||||
|
||||
public short[] getIndelSeq2() {
|
||||
return indelSeq2;
|
||||
}
|
||||
|
||||
public int getIndelLen2() {
|
||||
return indelLen2;
|
||||
}
|
||||
|
||||
public int getIndelLen1() {
|
||||
return indelLen1;
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -0,0 +1,107 @@
|
|||
package org.broadinstitute.sting.gatk.refdata;
|
||||
|
||||
import net.sf.picard.reference.ReferenceSequenceFile;
|
||||
import net.sf.picard.reference.ReferenceSequenceFileFactory;
|
||||
import org.broadinstitute.sting.BaseTest;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||
import org.broadinstitute.sting.utils.genotype.glf.SinglePointCall;
|
||||
import static org.junit.Assert.assertEquals;
|
||||
import static org.junit.Assert.fail;
|
||||
import org.junit.BeforeClass;
|
||||
import org.junit.Test;
|
||||
|
||||
import java.io.File;
|
||||
import java.util.Iterator;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: aaronmckenna
|
||||
* Date: Jul 15, 2009
|
||||
* Time: 12:18:50 AM
|
||||
*/
|
||||
public class RodGLFTest extends BaseTest {
|
||||
static final File glfFile = new File("/humgen/gsa-scr1/GATK_Data/Validation_Data/index_test_likelihoods.glf");
|
||||
static final int finalRecordCount = 484140; // the number of records in the above file
|
||||
static final int contigCount = 25;
|
||||
static final String ref = "/seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta";
|
||||
|
||||
@BeforeClass
|
||||
public static void before() {
|
||||
ReferenceSequenceFile r = ReferenceSequenceFileFactory.getReferenceSequenceFile(new File(ref));
|
||||
GenomeLocParser.setupRefContigOrdering(r);
|
||||
}
|
||||
|
||||
static class TestRODClass implements GLFRODIterator {
|
||||
private int count = 0;
|
||||
private int readCount = 0;
|
||||
|
||||
public TestRODClass(int count) {
|
||||
this.count = count;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean hasNext() {
|
||||
if (count <= 0) {
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
@Override
|
||||
public RodGLF next() {
|
||||
RodGLF glf = new RodGLF("Test");
|
||||
glf.mRecord = new SinglePointCall('A', readCount, 0, (short) 0, new double[]{0.0, 0.0});
|
||||
--count;
|
||||
readCount++;
|
||||
return glf;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void remove() {
|
||||
throw new UnsupportedOperationException("BOO");
|
||||
}
|
||||
}
|
||||
|
||||
//@Test
|
||||
public void testNext() {
|
||||
RodGLF.setGLFWrapper(new RodGLFTest.TestRODClass(10));
|
||||
Iterator<RodGLF> iter = RodGLF.createIterator("test", new File(""));
|
||||
int counter = 0;
|
||||
while (iter.hasNext()) {
|
||||
iter.next();
|
||||
counter++;
|
||||
}
|
||||
assertEquals(10, counter);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testRodCount() {
|
||||
|
||||
Iterator<RodGLF> iter = RodGLF.createIterator("test", glfFile);
|
||||
int counter = 0;
|
||||
while (iter.hasNext()) {
|
||||
RodGLF glf = iter.next();
|
||||
counter++;
|
||||
}
|
||||
assertEquals(finalRecordCount, counter);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testRodLocations() {
|
||||
|
||||
Iterator<RodGLF> iter = RodGLF.createIterator("test", glfFile);
|
||||
GenomeLoc loc = null;
|
||||
while (iter.hasNext()) {
|
||||
RodGLF glf = iter.next();
|
||||
if (loc != null) {
|
||||
if (glf.getLocation().isBefore(loc)) {
|
||||
fail("locations in the GLF came out of order loc = " + loc.toString() + " new loc = " + glf.getLocation().toString());
|
||||
}
|
||||
}
|
||||
loc = glf.getLocation();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
|
@ -0,0 +1,51 @@
|
|||
package org.broadinstitute.sting.utils.genotype.glf;
|
||||
|
||||
import org.broadinstitute.sting.BaseTest;
|
||||
import org.junit.Assert;
|
||||
import org.junit.Test;
|
||||
|
||||
import java.io.File;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
|
||||
/**
|
||||
* @author aaron
|
||||
* <p/>
|
||||
* Class GLFReaderTest
|
||||
* <p/>
|
||||
* A descriptions should go here. Blame aaron if it's missing.
|
||||
*/
|
||||
public class GLFReaderTest extends BaseTest {
|
||||
|
||||
|
||||
// our test file
|
||||
static final File glfFile = new File("/humgen/gsa-scr1/GATK_Data/Validation_Data/index_test_likelihoods.glf");
|
||||
static final int finalRecordCount = 484140; // the number of records in the above file
|
||||
static final int contigCount = 25;
|
||||
|
||||
/** read in the records from the file */
|
||||
@Test
|
||||
public void testReadRecords() {
|
||||
int recCount = 0;
|
||||
List<String> contigs = new ArrayList<String>();
|
||||
try {
|
||||
GLFReader reader = new GLFReader(glfFile);
|
||||
long location = 1;
|
||||
while (reader.hasNext()) {
|
||||
GLFRecord rec = reader.next();
|
||||
if (!contigs.contains(reader.getReferenceName())) {
|
||||
contigs.add(reader.getReferenceName());
|
||||
}
|
||||
location = location + rec.offset;
|
||||
//System.err.println("Record count = " + finalRecordCount + " offset " + rec.offset + " location = " + location + " type = " + rec.getRecordType());
|
||||
++recCount;
|
||||
}
|
||||
} catch (Exception e) {
|
||||
System.err.println("Record count = " + recCount);
|
||||
e.printStackTrace();
|
||||
}
|
||||
Assert.assertEquals(finalRecordCount, recCount);
|
||||
Assert.assertEquals(contigCount, contigs.size());
|
||||
}
|
||||
}
|
||||
Loading…
Reference in New Issue