GLF to variant context. Added some methods in GLF to aid testing, and added a test that reads GLF, converts to VC, writes GLF, and reads it back to compare.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3062 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
aaron 2010-03-23 03:43:25 +00:00
parent 3767adb0bb
commit eafdd047f7
6 changed files with 294 additions and 26 deletions

View File

@ -1,12 +1,16 @@
package org.broadinstitute.sting.gatk.refdata;
import org.broadinstitute.sting.utils.genotype.vcf.*;
import org.broadinstitute.sting.utils.genotype.CalledGenotype;
import org.broadinstitute.sting.utils.*;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContext;
import org.broadinstitute.sting.gatk.contexts.variantcontext.Allele;
import org.broadinstitute.sting.gatk.contexts.variantcontext.Genotype;
import org.broadinstitute.sting.gatk.contexts.variantcontext.MutableGenotype;
import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContext;
import org.broadinstitute.sting.utils.*;
import org.broadinstitute.sting.utils.genotype.CalledGenotype;
import org.broadinstitute.sting.utils.genotype.LikelihoodObject;
import org.broadinstitute.sting.utils.genotype.glf.GLFSingleCall;
import org.broadinstitute.sting.utils.genotype.glf.GLFWriter;
import org.broadinstitute.sting.utils.genotype.vcf.*;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
import java.util.*;
@ -38,6 +42,8 @@ public class VariantContextAdaptors {
adaptors.put(RodVCF.class, new RodVCFAdaptor());
adaptors.put(VCFRecord.class, new VCFRecordAdaptor());
adaptors.put(PlinkRod.class, new PlinkRodAdaptor());
adaptors.put(RodGLF.class, new GLFAdaptor());
// adaptors.put(RodGeliText.class, new GeliAdaptor());
}
public static boolean canBeConvertedToVariantContext(Object variantContainingObject) {
@ -425,4 +431,115 @@ public class VariantContextAdaptors {
}
}
// --------------------------------------------------------------------------------------------------------------
//
// GLF to VariantContext
//
// --------------------------------------------------------------------------------------------------------------
private static class GLFAdaptor extends VCAdaptor {
    /**
     * Convert a RodGLF to a VariantContext, deriving the reference allele from the record itself.
     *
     * @param name the name of the ROD
     * @param input the ROD object, in this case a RodGLF
     * @return a VariantContext object, or null if the reference bases are not acceptable allele bases
     */
    VariantContext convert(String name, Object input) {
        RodGLF glf = (RodGLF) input;
        if (!Allele.acceptableAlleleBases(glf.getReference()))
            return null;
        return convert(name, input, new Allele(glf.getReference(), true));
    }

    /**
     * Convert a RodGLF to a VariantContext using a caller-supplied reference allele.
     *
     * @param name the name of the ROD
     * @param input the ROD object, in this case a RodGLF
     * @param refAllele the reference base as an Allele object
     * @return a VariantContext object, or null if the record cannot be represented
     */
    VariantContext convert(String name, Object input, Allele refAllele) {
        RodGLF glf = (RodGLF) input;

        // we only know how to convert SNP and indel records
        if (!glf.isSNP() && !glf.isIndel())
            return null;

        // the reference allele comes first, followed by each distinct acceptable alt allele
        List<Allele> alleles = new ArrayList<Allele>();
        alleles.add(refAllele);
        for (String alt : glf.getAlternateAlleleList()) {
            if (!Allele.acceptableAlleleBases(alt))
                return null;
            Allele altAllele = new Allele(alt, false);
            if (!alleles.contains(altAllele))
                alleles.add(altAllele);
        }

        MutableGenotype call = new MutableGenotype(name, alleles);

        // only single (point) calls are supported; variable-length (indel) records are not
        if (!(glf.mRecord instanceof GLFSingleCall))
            throw new UnsupportedOperationException("We don't currenly support indel calls");

        // transform the likelihoods from negative log (positive values) to log (negative values)
        LikelihoodObject likelihoods =
                new LikelihoodObject(((GLFSingleCall) glf.mRecord).getLikelihoods(),
                                     LikelihoodObject.LIKELIHOOD_TYPE.NEGATIVE_LOG);
        likelihoods.setLikelihoodType(LikelihoodObject.LIKELIHOOD_TYPE.LOG);

        // stash the likelihoods, read depth, and RMS mapping quality on the genotype
        call.putAttribute(CalledGenotype.LIKELIHOODS_ATTRIBUTE_KEY, likelihoods.toDoubleArray());
        call.putAttribute(VCFGenotypeRecord.DEPTH_KEY, glf.mRecord.getReadDepth());
        call.putAttribute(GLFWriter.RMS_MAPPING_QUAL, (double) glf.mRecord.getRmsMapQ());

        // wrap the single call in a VariantContext and sanity-check it before returning
        Collection<Genotype> genotypes = new ArrayList<Genotype>();
        genotypes.add(call);
        VariantContext vc = new VariantContext(name, glf.getLocation(), alleles, genotypes,
                                               glf.getNegLog10PError(), null,
                                               new HashMap<String, String>());
        vc.validate();
        return vc;
    }
}
// --------------------------------------------------------------------------------------------------------------
//
// GELI to VariantContext
//
// --------------------------------------------------------------------------------------------------------------
/*
private static class GeliAdaptor extends VCAdaptor {
VariantContext convert(String name, Object input) {
if (!Allele.acceptableAlleleBases(((RodGeliText) input).getReference()))
return null;
Allele refAllele = new Allele(((RodGeliText) input).getReference(), true);
return convert(name, input, refAllele);
}
VariantContext convert(String name, Object input, Allele refAllele) {
RodGeliText geliText = (RodGeliText) input;
if (geliText.isSNP() || geliText.isIndel()) {
// add the reference allele
List<Allele> alleles = new ArrayList<Allele>();
alleles.add(refAllele);
// add all of the alt alleles
for (String alt : geliText.getAlternateAlleleList()) {
if (!Allele.acceptableAlleleBases(alt)) {
return null;
}
alleles.add(new Allele(alt, false));
}
Map<String, String> attributes = new HashMap<String, String>();
attributes.put("ID", geliText.getName());
Collection<Genotype> genotypes = null;
VariantContext vc = new VariantContext(name, geliText.getLocation(), alleles, genotypes, geliText.getNegLog10PError(), null, attributes);
vc.validate();
return vc;
} else
return null; // can't handle anything else
}
}*/
}

View File

@ -50,9 +50,6 @@ public abstract class GLFRecord {
protected int readDepth = 0;
protected short rmsMapQ = 0;
// the size of this base structure
protected final int baseSize = 10;
/** the reference base enumeration, with their short (type) values for GLF */
public enum REF_BASE {
X((short) 0x00),
@ -204,19 +201,19 @@ public abstract class GLFRecord {
* write this record to a binary codec output.
*
* @param out the binary codec to write to
* @param lastRecord the record to write
*/
void write(BinaryCodec out, GLFRecord lastRecord) {
long offset = 0;
if (lastRecord != null && lastRecord.getContig() == this.getContig())
long offset;
if (lastRecord != null && lastRecord.getContig().equals(this.getContig()))
offset = this.position - lastRecord.getPosition();
else
offset = this.position - 1; // we start at one, we need to subtract that off
short bite = ((short) (this.getRecordType().getReadTypeValue() << 4 | (refBase.getBaseHexValue() & 0x0f)));
out.writeUByte((short) (this.getRecordType().getReadTypeValue() << 4 | (refBase.getBaseHexValue() & 0x0f)));
out.writeUInt(((Long) (offset)).intValue()); // we have to subtract one, we're an offset
long write = (long) ((long) (readDepth & 0xffffff) | (long) (this.getMinimumLikelihood() & 0xff) << 24);
long write = ((long) (readDepth & 0xffffff) | (long) (this.getMinimumLikelihood() & 0xff) << 24);
out.writeUInt(write);
out.writeUByte((short) rmsMapQ);
out.writeUByte(rmsMapQ);
}
/**
@ -249,9 +246,9 @@ public abstract class GLFRecord {
/**
* find the minimum value in a set of doubles
*
* @param vals the array of values
*
* @return the minimum value
*/
protected static double findMin(double vals[]) {
if (vals.length < 1) throw new StingException("findMin: an array of size < 1 was passed in");
@ -293,5 +290,16 @@ public abstract class GLFRecord {
* @return a short of the minimum likelihood.
*/
protected abstract short calculateMinLikelihood();
/**
 * Field-wise comparison against another GLFRecord: contig, reference base, position,
 * read depth, and RMS mapping quality must all match. Null-safe (returns false for null).
 *
 * NOTE(review): this OVERLOADS Object.equals(Object) rather than overriding it, and no
 * matching hashCode() is defined — collections such as List.contains will still use
 * identity-based Object.equals. Confirm this is intentional before relying on it in
 * hash-based or contains-style lookups.
 */
public boolean equals(GLFRecord rec) {
return (rec != null) &&
contig.equals(rec.getContig()) &&
(refBase == rec.getRefBase()) &&
(position == rec.getPosition()) &&
(readDepth == rec.getReadDepth()) &&
(rmsMapQ == rec.getRmsMapQ());
}
}

View File

@ -116,6 +116,16 @@ public class GLFSingleCall extends GLFRecord {
return toCappedShort(minLikelihood);
}
@Override
public boolean equals(GLFRecord rec) {
    // the base-class fields must match and the record must be the same concrete type
    if (!super.equals(rec) || !(rec instanceof GLFSingleCall)) return false;
    double[] otherLikelihoods = ((GLFSingleCall) rec).getLikelihoods();
    if (otherLikelihoods.length != likelihoods.length) return false;
    // element-by-element comparison via Double.compare (handles NaN consistently)
    for (int i = 0; i < likelihoods.length; i++) {
        if (Double.compare(likelihoods[i], otherLikelihoods[i]) != 0) return false;
    }
    return getMinimumLikelihood() == rec.getMinimumLikelihood();
}
public double[] getLikelihoods() {
return likelihoods;
}

View File

@ -56,8 +56,6 @@ public class GLFVariableLengthCall extends GLFRecord {
* @param contig the contig this record is on
* @param refBase the reference base
* @param offset the location, as an offset from the previous glf record
* @param readDepth the read depth at the specified position
* @param rmsMapQ the root mean square of the mapping quality
* @param lkHom1 the negative log likelihood of the first homozygous indel allele, from 0 to 255
@ -158,4 +156,19 @@ public class GLFVariableLengthCall extends GLFRecord {
public int getIndelLen1() {
return indelLen1;
}
public boolean equals(GLFRecord rec) {
    // the base-class fields must match and the record must be the same concrete type
    if (!super.equals(rec) || !(rec instanceof GLFVariableLengthCall)) return false;
    GLFVariableLengthCall other = (GLFVariableLengthCall) rec;
    // compare the three likelihoods and both indel lengths in one guard
    if (lkHom1 != other.getLkHom1()
            || lkHom2 != other.getLkHom2()
            || lkHet != other.getLkHet()
            || indelLen1 != other.getIndelLen1()
            || indelLen2 != other.getIndelLen2())
        return false;
    // compare both indel sequences element by element
    // NOTE(review): assumes the other record's sequences are at least as long as ours — verify
    for (int i = 0; i < indelSeq1.length; i++) {
        if (indelSeq1[i] != other.getIndelSeq1()[i]) return false;
    }
    for (int i = 0; i < indelSeq2.length; i++) {
        if (indelSeq2[i] != other.getIndelSeq2()[i]) return false;
    }
    return minlikelihood == rec.getMinimumLikelihood() && size == rec.getByteSize();
}
}

View File

@ -3,14 +3,16 @@ package org.broadinstitute.sting.utils.genotype.glf;
import net.sf.samtools.SAMSequenceRecord;
import net.sf.samtools.util.BinaryCodec;
import net.sf.samtools.util.BlockCompressedOutputStream;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.gatk.contexts.variantcontext.MutableGenotype;
import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContext;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
import org.broadinstitute.sting.utils.pileup.PileupElement;
import org.broadinstitute.sting.utils.genotype.LikelihoodObject;
import org.broadinstitute.sting.utils.genotype.IndelLikelihood;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.genotype.CalledGenotype;
import org.broadinstitute.sting.gatk.contexts.variantcontext.*;
import org.broadinstitute.sting.utils.genotype.IndelLikelihood;
import org.broadinstitute.sting.utils.genotype.LikelihoodObject;
import org.broadinstitute.sting.utils.genotype.vcf.VCFGenotypeRecord;
import org.broadinstitute.sting.utils.pileup.PileupElement;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
import java.io.DataOutputStream;
import java.io.File;
@ -66,6 +68,9 @@ public class GLFWriter implements GLFGenotypeWriter {
// the last position written
private int lastPos = 1;
// a field for storing the RMS of the mapping qualities in a mutable variant context
public static final String RMS_MAPPING_QUAL = "RMS_MAPPING_QUAL";
/**
* The public constructor for creating a GLF object
*
@ -96,8 +101,8 @@ public class GLFWriter implements GLFGenotypeWriter {
*/
public void writeHeader(String headerText) {
this.headerText = headerText;
for (int x = 0; x < glfMagic.length; x++) {
outputBinaryCodec.writeUByte(glfMagic[x]);
for (short aGlfMagic : glfMagic) {
outputBinaryCodec.writeUByte(aGlfMagic);
}
if (!(headerText.equals(""))) {
outputBinaryCodec.writeString(headerText, true, true);
@ -175,10 +180,19 @@ public class GLFWriter implements GLFGenotypeWriter {
// calculate the RMS mapping qualities and the read depth
double rms = 0.0;
int readCount = 0;
if ( pileup != null ) {
if ( pileup != null) {
rms = calculateRMS(pileup);
readCount = pileup.size();
}
// if we can't get the rms from the read pile-up (preferred), check the tags, the VC might have it
if (genotype.hasAttribute(RMS_MAPPING_QUAL) && new Double(0.0).equals(rms))
rms = (Double)((MutableGenotype)genotype).getAttribute(RMS_MAPPING_QUAL);
// if we can't get the depth from the read pile-up (preferred), check the tags, the VC might have it
if (genotype.hasAttribute(VCFGenotypeRecord.DEPTH_KEY) && 0 == readCount)
readCount = (Integer)((MutableGenotype)genotype).getAttribute(VCFGenotypeRecord.DEPTH_KEY);
addCall(GenomeLocParser.getContigInfo(vc.getLocation().getContig()), (int)vc.getLocation().getStart(), (float) rms, ref, readCount, obj);
}

View File

@ -0,0 +1,106 @@
package org.broadinstitute.sting.gatk.refdata;
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContext;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.fasta.IndexedFastaSequenceFile;
import org.broadinstitute.sting.utils.genotype.GenotypeWriter;
import org.broadinstitute.sting.utils.genotype.GenotypeWriterFactory;
import org.broadinstitute.sting.utils.genotype.glf.GLFSingleCall;
import org.broadinstitute.sting.utils.genotype.glf.GLFWriter;
import org.junit.Assert;
import org.junit.BeforeClass;
import org.junit.Test;
import java.io.File;
import java.io.FileNotFoundException;
import java.util.ArrayList;
import java.util.List;
/**
*
* @author aaron
*
* Class VariantContextAdaptorsTest
*
* This test class exists to test input -> output of variant formats
* run through the VariantContext class.
*/
public class VariantContextAdaptorsTest extends BaseTest {
    public static IndexedFastaSequenceFile seq = null;

    @BeforeClass
    public static void beforeClass() {
        try {
            seq = new IndexedFastaSequenceFile(new File(oneKGLocation + "/reference/human_b36_both.fasta")); // TODO: make human reference use BaseTest
        } catch (FileNotFoundException e) {
            Assert.fail("Unable to load reference " + oneKGLocation + "/reference/human_b36_both.fasta");
        }
        GenomeLocParser.setupRefContigOrdering(seq);
    }

    /**
     * this test takes a known GLF file, reads in the records (storing them into an array),
     * and creates VariantContext records. These VC records are then outputted through a genotype writer,
     * and then read back in off of disk and compared to the original records. This way we are positive all
     * the information that encodes a GLF makes it into the VC and then out to disk.
     */
    @Test
    public void testVariantContextGLFToGLF() {
        // our input file: a known good GLF
        File referenceFile = new File(validationDataLocation + "/well_formed.glf");

        // use a real temporary file rather than writing "temp.glf" into the working
        // directory, which risks collisions between concurrent test runs
        File tempFile = null;
        try {
            tempFile = File.createTempFile("VariantContextAdaptorsTest", ".glf");
        } catch (java.io.IOException e) {
            Assert.fail("Unable to create a temporary GLF file: " + e.getMessage());
        }
        tempFile.deleteOnExit(); // delete when we're done

        // create our genotype writer for GLFs
        GenotypeWriter gw = GenotypeWriterFactory.create(GenotypeWriterFactory.GENOTYPE_FORMAT.GLF, tempFile);
        ((GLFWriter) gw).writeHeader("");

        // cycle the input file to the output file, buffering the records we see
        RodGLF glf = openGLF(referenceFile);
        List<GLFSingleCall> records = new ArrayList<GLFSingleCall>();
        while (glf.hasNext()) {
            glf.next();
            records.add((GLFSingleCall) glf.mRecord); // we know they're all single calls in the reference file
            VariantContext vc = VariantContextAdaptors.toVariantContext("GLF", glf);
            gw.addCall(vc);
        }
        gw.close(); // close the file

        // now reopen the temp GLF file, read it back in, and compare against what we first stored
        List<GLFSingleCall> records2 = readSingleCalls(tempFile);

        // compare sizes, then each record
        Assert.assertEquals("The input GLF file doesn't contain the same number of records as we saw in the first file", records.size(), records2.size());
        for (int x = 0; x < records.size(); x++)
            Assert.assertTrue("GLF Records were not preserved when cycling them to and from disc", records.get(x).equals(records2.get(x)));
    }

    /** open a RodGLF over the given file, failing the test if the file cannot be found */
    private static RodGLF openGLF(File glfFile) {
        RodGLF glf = new RodGLF("myROD");
        try {
            glf.initialize(glfFile);
        } catch (FileNotFoundException e) {
            Assert.fail("Unable to open GLF file" + glfFile);
        }
        return glf;
    }

    /** read every record from the given GLF file as a single call, in order */
    private static List<GLFSingleCall> readSingleCalls(File glfFile) {
        RodGLF glf = openGLF(glfFile);
        List<GLFSingleCall> calls = new ArrayList<GLFSingleCall>();
        while (glf.hasNext()) {
            glf.next();
            calls.add((GLFSingleCall) glf.mRecord); // we know they're all single calls
        }
        return calls;
    }
}