GLF to variant context. Added some methods in GLF to aid testing; and added a test that reads GLF, converts to VC, writes GLF and reads back to compare.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3062 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
3767adb0bb
commit
eafdd047f7
|
|
@ -1,12 +1,16 @@
|
|||
package org.broadinstitute.sting.gatk.refdata;
|
||||
|
||||
import org.broadinstitute.sting.utils.genotype.vcf.*;
|
||||
import org.broadinstitute.sting.utils.genotype.CalledGenotype;
|
||||
import org.broadinstitute.sting.utils.*;
|
||||
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
|
||||
import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.variantcontext.Allele;
|
||||
import org.broadinstitute.sting.gatk.contexts.variantcontext.Genotype;
|
||||
import org.broadinstitute.sting.gatk.contexts.variantcontext.MutableGenotype;
|
||||
import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContext;
|
||||
import org.broadinstitute.sting.utils.*;
|
||||
import org.broadinstitute.sting.utils.genotype.CalledGenotype;
|
||||
import org.broadinstitute.sting.utils.genotype.LikelihoodObject;
|
||||
import org.broadinstitute.sting.utils.genotype.glf.GLFSingleCall;
|
||||
import org.broadinstitute.sting.utils.genotype.glf.GLFWriter;
|
||||
import org.broadinstitute.sting.utils.genotype.vcf.*;
|
||||
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
|
|
@ -38,6 +42,8 @@ public class VariantContextAdaptors {
|
|||
adaptors.put(RodVCF.class, new RodVCFAdaptor());
|
||||
adaptors.put(VCFRecord.class, new VCFRecordAdaptor());
|
||||
adaptors.put(PlinkRod.class, new PlinkRodAdaptor());
|
||||
adaptors.put(RodGLF.class, new GLFAdaptor());
|
||||
// adaptors.put(RodGeliText.class, new GeliAdaptor());
|
||||
}
|
||||
|
||||
public static boolean canBeConvertedToVariantContext(Object variantContainingObject) {
|
||||
|
|
@ -425,4 +431,115 @@ public class VariantContextAdaptors {
|
|||
}
|
||||
}
|
||||
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// GLF to VariantContext
|
||||
//
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
private static class GLFAdaptor extends VCAdaptor {
|
||||
/**
|
||||
* convert to a Variant Context, given:
|
||||
* @param name the name of the ROD
|
||||
* @param input the Rod object, in this case a RodGLF
|
||||
* @return a VariantContext object
|
||||
*/
|
||||
VariantContext convert(String name, Object input) {
|
||||
if ( ! Allele.acceptableAlleleBases(((RodGLF)input).getReference()) )
|
||||
return null;
|
||||
Allele refAllele = new Allele(((RodGLF)input).getReference(), true);
|
||||
return convert(name, input, refAllele);
|
||||
}
|
||||
|
||||
/**
|
||||
* convert to a Variant Context, given:
|
||||
* @param name the name of the ROD
|
||||
* @param input the Rod object, in this case a RodGLF
|
||||
* @param refAllele the reference base as an Allele object
|
||||
* @return a VariantContext object
|
||||
*/
|
||||
VariantContext convert(String name, Object input, Allele refAllele) {
|
||||
RodGLF glf = (RodGLF)input;
|
||||
|
||||
// make sure we can convert it
|
||||
if ( glf.isSNP() || glf.isIndel()) {
|
||||
// add the reference allele
|
||||
List<Allele> alleles = new ArrayList<Allele>();
|
||||
alleles.add(refAllele);
|
||||
|
||||
// add all of the alt alleles
|
||||
for ( String alt : glf.getAlternateAlleleList() ) {
|
||||
if ( ! Allele.acceptableAlleleBases(alt) ) {
|
||||
return null;
|
||||
}
|
||||
Allele allele = new Allele(alt, false);
|
||||
if (!alleles.contains(allele)) alleles.add(allele);
|
||||
}
|
||||
|
||||
|
||||
Map<String, String> attributes = new HashMap<String, String>();
|
||||
Collection<Genotype> genotypes = new ArrayList<Genotype>();
|
||||
MutableGenotype call = new MutableGenotype(name, alleles);
|
||||
|
||||
if (glf.mRecord instanceof GLFSingleCall) {
|
||||
// transform the likelihoods from negative log (positive double values) to log values (negitive values)
|
||||
LikelihoodObject obj = new LikelihoodObject(((GLFSingleCall)glf.mRecord).getLikelihoods(), LikelihoodObject.LIKELIHOOD_TYPE.NEGATIVE_LOG);
|
||||
obj.setLikelihoodType(LikelihoodObject.LIKELIHOOD_TYPE.LOG);
|
||||
|
||||
// set the likelihoods, depth, and RMS mapping quality values
|
||||
call.putAttribute(CalledGenotype.LIKELIHOODS_ATTRIBUTE_KEY,obj.toDoubleArray());
|
||||
call.putAttribute(VCFGenotypeRecord.DEPTH_KEY,(glf.mRecord.getReadDepth()));
|
||||
call.putAttribute(GLFWriter.RMS_MAPPING_QUAL, (double) glf.mRecord.getRmsMapQ());
|
||||
} else {
|
||||
throw new UnsupportedOperationException("We don't currenly support indel calls");
|
||||
}
|
||||
|
||||
// add the call to the genotype list, and then use this list to create a VariantContext
|
||||
genotypes.add(call);
|
||||
VariantContext vc = new VariantContext(name, glf.getLocation(), alleles, genotypes, glf.getNegLog10PError(), null, attributes);
|
||||
vc.validate();
|
||||
return vc;
|
||||
} else
|
||||
return null; // can't handle anything else
|
||||
}
|
||||
}
|
||||
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// GELI to VariantContext
|
||||
//
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
/*
|
||||
private static class GeliAdaptor extends VCAdaptor {
|
||||
VariantContext convert(String name, Object input) {
|
||||
if (!Allele.acceptableAlleleBases(((RodGeliText) input).getReference()))
|
||||
return null;
|
||||
Allele refAllele = new Allele(((RodGeliText) input).getReference(), true);
|
||||
return convert(name, input, refAllele);
|
||||
}
|
||||
|
||||
VariantContext convert(String name, Object input, Allele refAllele) {
|
||||
RodGeliText geliText = (RodGeliText) input;
|
||||
if (geliText.isSNP() || geliText.isIndel()) {
|
||||
// add the reference allele
|
||||
List<Allele> alleles = new ArrayList<Allele>();
|
||||
alleles.add(refAllele);
|
||||
|
||||
// add all of the alt alleles
|
||||
for (String alt : geliText.getAlternateAlleleList()) {
|
||||
if (!Allele.acceptableAlleleBases(alt)) {
|
||||
return null;
|
||||
}
|
||||
alleles.add(new Allele(alt, false));
|
||||
}
|
||||
|
||||
Map<String, String> attributes = new HashMap<String, String>();
|
||||
attributes.put("ID", geliText.getName());
|
||||
Collection<Genotype> genotypes = null;
|
||||
VariantContext vc = new VariantContext(name, geliText.getLocation(), alleles, genotypes, geliText.getNegLog10PError(), null, attributes);
|
||||
vc.validate();
|
||||
return vc;
|
||||
} else
|
||||
return null; // can't handle anything else
|
||||
}
|
||||
}*/
|
||||
}
|
||||
|
|
|
|||
|
|
@ -50,9 +50,6 @@ public abstract class GLFRecord {
|
|||
protected int readDepth = 0;
|
||||
protected short rmsMapQ = 0;
|
||||
|
||||
// the size of this base structure
|
||||
protected final int baseSize = 10;
|
||||
|
||||
/** the reference base enumeration, with their short (type) values for GLF */
|
||||
public enum REF_BASE {
|
||||
X((short) 0x00),
|
||||
|
|
@ -204,19 +201,19 @@ public abstract class GLFRecord {
|
|||
* write the this record to a binary codec output.
|
||||
*
|
||||
* @param out the binary codec to write to
|
||||
* @param lastRecord the record to write
|
||||
*/
|
||||
void write(BinaryCodec out, GLFRecord lastRecord) {
|
||||
long offset = 0;
|
||||
if (lastRecord != null && lastRecord.getContig() == this.getContig())
|
||||
long offset;
|
||||
if (lastRecord != null && lastRecord.getContig().equals(this.getContig()))
|
||||
offset = this.position - lastRecord.getPosition();
|
||||
else
|
||||
offset = this.position - 1; // we start at one, we need to subtract that off
|
||||
short bite = ((short) (this.getRecordType().getReadTypeValue() << 4 | (refBase.getBaseHexValue() & 0x0f)));
|
||||
out.writeUByte((short) (this.getRecordType().getReadTypeValue() << 4 | (refBase.getBaseHexValue() & 0x0f)));
|
||||
out.writeUInt(((Long) (offset)).intValue()); // we have to subtract one, we're an offset
|
||||
long write = (long) ((long) (readDepth & 0xffffff) | (long) (this.getMinimumLikelihood() & 0xff) << 24);
|
||||
long write = ((long) (readDepth & 0xffffff) | (long) (this.getMinimumLikelihood() & 0xff) << 24);
|
||||
out.writeUInt(write);
|
||||
out.writeUByte((short) rmsMapQ);
|
||||
out.writeUByte(rmsMapQ);
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -249,9 +246,9 @@ public abstract class GLFRecord {
|
|||
/**
|
||||
* find the minimum value in a set of doubles
|
||||
*
|
||||
* @param vals
|
||||
* @param vals the array of values
|
||||
*
|
||||
* @return
|
||||
* @return the minimum value
|
||||
*/
|
||||
protected static double findMin(double vals[]) {
|
||||
if (vals.length < 1) throw new StingException("findMin: an array of size < 1 was passed in");
|
||||
|
|
@ -293,5 +290,16 @@ public abstract class GLFRecord {
|
|||
* @return a short of the minimum likelihood.
|
||||
*/
|
||||
protected abstract short calculateMinLikelihood();
|
||||
|
||||
public boolean equals(GLFRecord rec) {
|
||||
return (rec != null) &&
|
||||
contig.equals(rec.getContig()) &&
|
||||
(refBase == rec.getRefBase()) &&
|
||||
(position == rec.getPosition()) &&
|
||||
(readDepth == rec.getReadDepth()) &&
|
||||
(rmsMapQ == rec.getRmsMapQ());
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -116,6 +116,16 @@ public class GLFSingleCall extends GLFRecord {
|
|||
return toCappedShort(minLikelihood);
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean equals(GLFRecord rec) {
|
||||
if (!super.equals(rec)) return false;
|
||||
if (!(rec instanceof GLFSingleCall)) return false;
|
||||
if (((GLFSingleCall) rec).getLikelihoods().length != this.likelihoods.length) return false;
|
||||
for (int x = 0; x < likelihoods.length; x++)
|
||||
if (Double.compare(likelihoods[x],((GLFSingleCall) rec).getLikelihoods()[x]) != 0) return false;
|
||||
return this.getMinimumLikelihood() == rec.getMinimumLikelihood();
|
||||
}
|
||||
|
||||
public double[] getLikelihoods() {
|
||||
return likelihoods;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -56,8 +56,6 @@ public class GLFVariableLengthCall extends GLFRecord {
|
|||
* @param contig the contig this record is on
|
||||
* @param refBase the reference base
|
||||
* @param offset the location, as an offset from the previous glf record
|
||||
* @param offset the location, as an offset from the previous glf record
|
||||
s
|
||||
* @param readDepth the read depth at the specified postion
|
||||
* @param rmsMapQ the root mean square of the mapping quality
|
||||
* @param lkHom1 the negitive log likelihood of the first homozygous indel allele, from 0 to 255
|
||||
|
|
@ -158,4 +156,19 @@ public class GLFVariableLengthCall extends GLFRecord {
|
|||
public int getIndelLen1() {
|
||||
return indelLen1;
|
||||
}
|
||||
|
||||
public boolean equals(GLFRecord rec) {
|
||||
if (!super.equals(rec)) return false;
|
||||
if (!(rec instanceof GLFVariableLengthCall)) return false;
|
||||
if (lkHom1 != ((GLFVariableLengthCall) rec).getLkHom1()) return false;
|
||||
if (lkHom2 != ((GLFVariableLengthCall) rec).getLkHom2()) return false;
|
||||
if (lkHet != ((GLFVariableLengthCall) rec).getLkHet()) return false;
|
||||
if (indelLen1 != ((GLFVariableLengthCall) rec).getIndelLen1()) return false;
|
||||
if (indelLen2 != ((GLFVariableLengthCall) rec).getIndelLen2()) return false;
|
||||
for (int x = 0; x < indelSeq1.length; x++)
|
||||
if (indelSeq1[x] != ((GLFVariableLengthCall) rec).getIndelSeq1()[x]) return false;
|
||||
for (int x = 0; x < indelSeq2.length; x++)
|
||||
if (indelSeq2[x] != ((GLFVariableLengthCall) rec).getIndelSeq2()[x]) return false;
|
||||
return minlikelihood == rec.getMinimumLikelihood() && size == rec.getByteSize();
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -3,14 +3,16 @@ package org.broadinstitute.sting.utils.genotype.glf;
|
|||
import net.sf.samtools.SAMSequenceRecord;
|
||||
import net.sf.samtools.util.BinaryCodec;
|
||||
import net.sf.samtools.util.BlockCompressedOutputStream;
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
import org.broadinstitute.sting.gatk.contexts.variantcontext.MutableGenotype;
|
||||
import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContext;
|
||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
|
||||
import org.broadinstitute.sting.utils.pileup.PileupElement;
|
||||
import org.broadinstitute.sting.utils.genotype.LikelihoodObject;
|
||||
import org.broadinstitute.sting.utils.genotype.IndelLikelihood;
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
import org.broadinstitute.sting.utils.genotype.CalledGenotype;
|
||||
import org.broadinstitute.sting.gatk.contexts.variantcontext.*;
|
||||
import org.broadinstitute.sting.utils.genotype.IndelLikelihood;
|
||||
import org.broadinstitute.sting.utils.genotype.LikelihoodObject;
|
||||
import org.broadinstitute.sting.utils.genotype.vcf.VCFGenotypeRecord;
|
||||
import org.broadinstitute.sting.utils.pileup.PileupElement;
|
||||
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
|
||||
|
||||
import java.io.DataOutputStream;
|
||||
import java.io.File;
|
||||
|
|
@ -66,6 +68,9 @@ public class GLFWriter implements GLFGenotypeWriter {
|
|||
// the last position written
|
||||
private int lastPos = 1;
|
||||
|
||||
// a field for storing the RMS of the mapping qualities in a mutable variant context
|
||||
public static final String RMS_MAPPING_QUAL = "RMS_MAPPING_QUAL";
|
||||
|
||||
/**
|
||||
* The public constructor for creating a GLF object
|
||||
*
|
||||
|
|
@ -96,8 +101,8 @@ public class GLFWriter implements GLFGenotypeWriter {
|
|||
*/
|
||||
public void writeHeader(String headerText) {
|
||||
this.headerText = headerText;
|
||||
for (int x = 0; x < glfMagic.length; x++) {
|
||||
outputBinaryCodec.writeUByte(glfMagic[x]);
|
||||
for (short aGlfMagic : glfMagic) {
|
||||
outputBinaryCodec.writeUByte(aGlfMagic);
|
||||
}
|
||||
if (!(headerText.equals(""))) {
|
||||
outputBinaryCodec.writeString(headerText, true, true);
|
||||
|
|
@ -175,10 +180,19 @@ public class GLFWriter implements GLFGenotypeWriter {
|
|||
// calculate the RMS mapping qualities and the read depth
|
||||
double rms = 0.0;
|
||||
int readCount = 0;
|
||||
if ( pileup != null ) {
|
||||
|
||||
if ( pileup != null) {
|
||||
rms = calculateRMS(pileup);
|
||||
readCount = pileup.size();
|
||||
}
|
||||
// if we can't get the rms from the read pile-up (preferred), check the tags, the VC might have it
|
||||
if (genotype.hasAttribute(RMS_MAPPING_QUAL) && new Double(0.0).equals(rms))
|
||||
rms = (Double)((MutableGenotype)genotype).getAttribute(RMS_MAPPING_QUAL);
|
||||
|
||||
// if we can't get the depth from the read pile-up (preferred), check the tags, the VC might have it
|
||||
if (genotype.hasAttribute(VCFGenotypeRecord.DEPTH_KEY) && 0 == readCount)
|
||||
readCount = (Integer)((MutableGenotype)genotype).getAttribute(VCFGenotypeRecord.DEPTH_KEY);
|
||||
|
||||
addCall(GenomeLocParser.getContigInfo(vc.getLocation().getContig()), (int)vc.getLocation().getStart(), (float) rms, ref, readCount, obj);
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -0,0 +1,106 @@
|
|||
package org.broadinstitute.sting.gatk.refdata;
|
||||
|
||||
import org.broadinstitute.sting.BaseTest;
|
||||
import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContext;
|
||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||
import org.broadinstitute.sting.utils.fasta.IndexedFastaSequenceFile;
|
||||
import org.broadinstitute.sting.utils.genotype.GenotypeWriter;
|
||||
import org.broadinstitute.sting.utils.genotype.GenotypeWriterFactory;
|
||||
import org.broadinstitute.sting.utils.genotype.glf.GLFSingleCall;
|
||||
import org.broadinstitute.sting.utils.genotype.glf.GLFWriter;
|
||||
import org.junit.Assert;
|
||||
import org.junit.BeforeClass;
|
||||
import org.junit.Test;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileNotFoundException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
|
||||
/**
|
||||
*
|
||||
* @author aaron
|
||||
*
|
||||
* Class VariantContextAdaptorsTest
|
||||
*
|
||||
* This test class exists to test input -> output of variant formats
|
||||
* run through the VariantContext class.
|
||||
*/
|
||||
public class VariantContextAdaptorsTest extends BaseTest {
|
||||
public static IndexedFastaSequenceFile seq = null;
|
||||
|
||||
@BeforeClass
|
||||
public static void beforeClass() {
|
||||
try {
|
||||
seq = new IndexedFastaSequenceFile(new File(oneKGLocation + "/reference/human_b36_both.fasta")); // TODO: make human reference use BaseTest
|
||||
} catch (FileNotFoundException e) {
|
||||
Assert.fail("Unable to load reference " + oneKGLocation + "/reference/human_b36_both.fasta");
|
||||
}
|
||||
GenomeLocParser.setupRefContigOrdering(seq);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* this test takes a known GLF file, reads in the records (storing them into an array),
|
||||
* and creates VariantContext records. These VC records are then outputted through a genotype writer,
|
||||
* and then read back in off of disk and compared to the original records. This way we are positive all
|
||||
* the information that encodes a GLF makes it into the VC and then out to disk.
|
||||
*/
|
||||
@Test
|
||||
public void testVariantContextGLFToGLF() {
|
||||
|
||||
// our input and output files
|
||||
File referenceFile = new File(validationDataLocation + "/well_formed.glf"); // our known good GLF
|
||||
File tempFile = new File("temp.glf"); // our temporary GLF output -> input file
|
||||
tempFile.deleteOnExit(); // delete when we're done
|
||||
|
||||
// create our genotype writer for GLFs
|
||||
GenotypeWriter gw = GenotypeWriterFactory.create(GenotypeWriterFactory.GENOTYPE_FORMAT.GLF,tempFile);
|
||||
((GLFWriter)gw).writeHeader("");
|
||||
|
||||
RodGLF glf = new RodGLF("myROD"); // now cycle the input file to the output file
|
||||
try {
|
||||
glf.initialize(referenceFile);
|
||||
} catch (FileNotFoundException e) {
|
||||
Assert.fail("Unable to open GLF file" + referenceFile);
|
||||
}
|
||||
|
||||
// buffer the records we see
|
||||
List<GLFSingleCall> records = new ArrayList<GLFSingleCall>();
|
||||
|
||||
// while we have records, make a Variant Context and output it to a GLF file
|
||||
while (glf.hasNext()) {
|
||||
glf.next();
|
||||
records.add((GLFSingleCall)glf.mRecord); // we know they're all single calls in the reference file
|
||||
VariantContext vc = VariantContextAdaptors.toVariantContext("GLF",glf);
|
||||
gw.addCall(vc);
|
||||
}
|
||||
gw.close(); // close the file
|
||||
|
||||
|
||||
// now reopen the file with the temp GLF file and read it back in, compare against what we first stored
|
||||
glf = new RodGLF("myROD");
|
||||
try {
|
||||
glf.initialize(tempFile);
|
||||
} catch (FileNotFoundException e) {
|
||||
Assert.fail("Unable to open GLF file" + tempFile);
|
||||
}
|
||||
|
||||
// buffer the new records we see
|
||||
List<GLFSingleCall> records2 = new ArrayList<GLFSingleCall>();
|
||||
|
||||
// while we have records, make a Variant Context and output it to a GLF file
|
||||
while (glf.hasNext()) {
|
||||
glf.next();
|
||||
records2.add((GLFSingleCall)glf.mRecord); // we know they're all single calls in the reference file
|
||||
}
|
||||
|
||||
// compare sizes
|
||||
Assert.assertEquals("The input GLF file doesn't contain the same number of records as we saw in the first file", records.size(),records2.size());
|
||||
|
||||
// now compare each record
|
||||
for (int x = 0; x < records.size(); x++)
|
||||
Assert.assertTrue("GLF Records were not preserved when cycling them to and from disc", records.get(x).equals(records2.get(x)));
|
||||
}
|
||||
}
|
||||
Loading…
Reference in New Issue