From eafdd047f7859391347257f20b012934bbf83186 Mon Sep 17 00:00:00 2001 From: aaron Date: Tue, 23 Mar 2010 03:43:25 +0000 Subject: [PATCH] GLF to variant context. Added some methods in GLF to aid testing; and added a test that reads GLF, converts to VC, writes GLF and reads back to compare. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3062 348d0f76-0448-11de-a6fe-93d51630548a --- .../gatk/refdata/VariantContextAdaptors.java | 127 +++++++++++++++++- .../sting/utils/genotype/glf/GLFRecord.java | 28 ++-- .../utils/genotype/glf/GLFSingleCall.java | 10 ++ .../genotype/glf/GLFVariableLengthCall.java | 17 ++- .../sting/utils/genotype/glf/GLFWriter.java | 32 +++-- .../refdata/VariantContextAdaptorsTest.java | 106 +++++++++++++++ 6 files changed, 294 insertions(+), 26 deletions(-) create mode 100644 java/test/org/broadinstitute/sting/gatk/refdata/VariantContextAdaptorsTest.java diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/VariantContextAdaptors.java b/java/src/org/broadinstitute/sting/gatk/refdata/VariantContextAdaptors.java index 31f7d3ee1..697874886 100755 --- a/java/src/org/broadinstitute/sting/gatk/refdata/VariantContextAdaptors.java +++ b/java/src/org/broadinstitute/sting/gatk/refdata/VariantContextAdaptors.java @@ -1,12 +1,16 @@ package org.broadinstitute.sting.gatk.refdata; -import org.broadinstitute.sting.utils.genotype.vcf.*; -import org.broadinstitute.sting.utils.genotype.CalledGenotype; -import org.broadinstitute.sting.utils.*; -import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; -import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContext; import org.broadinstitute.sting.gatk.contexts.variantcontext.Allele; import org.broadinstitute.sting.gatk.contexts.variantcontext.Genotype; +import org.broadinstitute.sting.gatk.contexts.variantcontext.MutableGenotype; +import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContext; +import org.broadinstitute.sting.utils.*; +import org.broadinstitute.sting.utils.genotype.CalledGenotype; +import org.broadinstitute.sting.utils.genotype.LikelihoodObject; +import org.broadinstitute.sting.utils.genotype.glf.GLFSingleCall; +import org.broadinstitute.sting.utils.genotype.glf.GLFWriter; +import org.broadinstitute.sting.utils.genotype.vcf.*; +import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; import java.util.*; @@ -38,6 +42,8 @@ public class VariantContextAdaptors { adaptors.put(RodVCF.class, new RodVCFAdaptor()); adaptors.put(VCFRecord.class, new VCFRecordAdaptor()); adaptors.put(PlinkRod.class, new PlinkRodAdaptor()); + adaptors.put(RodGLF.class, new GLFAdaptor()); + // adaptors.put(RodGeliText.class, new GeliAdaptor()); } public static boolean canBeConvertedToVariantContext(Object variantContainingObject) { @@ -425,4 +431,115 @@ public class VariantContextAdaptors { } } + // -------------------------------------------------------------------------------------------------------------- + // + // GLF to VariantContext + // + // -------------------------------------------------------------------------------------------------------------- + private static class GLFAdaptor extends VCAdaptor { + /** + * convert to a Variant Context, given: + * @param name the name of the ROD + * @param input the Rod object, in this case a RodGLF + * @return a VariantContext object + */ + VariantContext convert(String name, Object input) { + if ( ! Allele.acceptableAlleleBases(((RodGLF)input).getReference()) ) + return null; + Allele refAllele = new Allele(((RodGLF)input).getReference(), true); + return convert(name, input, refAllele); + } + + /** + * convert to a Variant Context, given: + * @param name the name of the ROD + * @param input the Rod object, in this case a RodGLF + * @param refAllele the reference base as an Allele object + * @return a VariantContext object + */ + VariantContext convert(String name, Object input, Allele refAllele) { + RodGLF glf = (RodGLF)input; + + // make sure we can convert it + if ( glf.isSNP() || glf.isIndel()) { + // add the reference allele + List alleles = new ArrayList(); + alleles.add(refAllele); + + // add all of the alt alleles + for ( String alt : glf.getAlternateAlleleList() ) { + if ( ! Allele.acceptableAlleleBases(alt) ) { + return null; + } + Allele allele = new Allele(alt, false); + if (!alleles.contains(allele)) alleles.add(allele); + } + + + Map attributes = new HashMap(); + Collection genotypes = new ArrayList(); + MutableGenotype call = new MutableGenotype(name, alleles); + + if (glf.mRecord instanceof GLFSingleCall) { + // transform the likelihoods from negative log (positive double values) to log values (negitive values) + LikelihoodObject obj = new LikelihoodObject(((GLFSingleCall)glf.mRecord).getLikelihoods(), LikelihoodObject.LIKELIHOOD_TYPE.NEGATIVE_LOG); + obj.setLikelihoodType(LikelihoodObject.LIKELIHOOD_TYPE.LOG); + + // set the likelihoods, depth, and RMS mapping quality values + call.putAttribute(CalledGenotype.LIKELIHOODS_ATTRIBUTE_KEY,obj.toDoubleArray()); + call.putAttribute(VCFGenotypeRecord.DEPTH_KEY,(glf.mRecord.getReadDepth())); + call.putAttribute(GLFWriter.RMS_MAPPING_QUAL, (double) glf.mRecord.getRmsMapQ()); + } else { + throw new UnsupportedOperationException("We don't currenly support indel calls"); + } + + // add the call to the genotype list, and then use this list to create a VariantContext + genotypes.add(call); + VariantContext vc = new VariantContext(name, glf.getLocation(), alleles, genotypes, glf.getNegLog10PError(), null, attributes); + vc.validate(); + return vc; + } else + return null; // can't handle anything else + } + } + + // -------------------------------------------------------------------------------------------------------------- + // + // GELI to VariantContext + // + // -------------------------------------------------------------------------------------------------------------- +/* + private static class GeliAdaptor extends VCAdaptor { + VariantContext convert(String name, Object input) { + if (!Allele.acceptableAlleleBases(((RodGeliText) input).getReference())) + return null; + Allele refAllele = new Allele(((RodGeliText) input).getReference(), true); + return convert(name, input, refAllele); + } + + VariantContext convert(String name, Object input, Allele refAllele) { + RodGeliText geliText = (RodGeliText) input; + if (geliText.isSNP() || geliText.isIndel()) { + // add the reference allele + List alleles = new ArrayList(); + alleles.add(refAllele); + + // add all of the alt alleles + for (String alt : geliText.getAlternateAlleleList()) { + if (!Allele.acceptableAlleleBases(alt)) { + return null; + } + alleles.add(new Allele(alt, false)); + } + + Map attributes = new HashMap(); + attributes.put("ID", geliText.getName()); + Collection genotypes = null; + VariantContext vc = new VariantContext(name, geliText.getLocation(), alleles, genotypes, geliText.getNegLog10PError(), null, attributes); + vc.validate(); + return vc; + } else + return null; // can't handle anything else + } + }*/ } diff --git a/java/src/org/broadinstitute/sting/utils/genotype/glf/GLFRecord.java b/java/src/org/broadinstitute/sting/utils/genotype/glf/GLFRecord.java index 2b7f61cd6..a460d897e 100755 --- a/java/src/org/broadinstitute/sting/utils/genotype/glf/GLFRecord.java +++ b/java/src/org/broadinstitute/sting/utils/genotype/glf/GLFRecord.java @@ -50,9 +50,6 @@ public abstract class GLFRecord { protected int readDepth = 0; protected short rmsMapQ = 0; - // the size of this base structure - protected final int baseSize = 10; - /** the reference base enumeration, with their short (type) values for GLF */ public enum REF_BASE { X((short) 0x00), @@ -204,19 +201,19 @@ public abstract class GLFRecord { * write the this record to a binary codec output. * * @param out the binary codec to write to + * @param lastRecord the record to write */ void write(BinaryCodec out, GLFRecord lastRecord) { - long offset = 0; - if (lastRecord != null && lastRecord.getContig() == this.getContig()) + long offset; + if (lastRecord != null && lastRecord.getContig().equals(this.getContig())) offset = this.position - lastRecord.getPosition(); else offset = this.position - 1; // we start at one, we need to subtract that off - short bite = ((short) (this.getRecordType().getReadTypeValue() << 4 | (refBase.getBaseHexValue() & 0x0f))); out.writeUByte((short) (this.getRecordType().getReadTypeValue() << 4 | (refBase.getBaseHexValue() & 0x0f))); out.writeUInt(((Long) (offset)).intValue()); // we have to subtract one, we're an offset - long write = (long) ((long) (readDepth & 0xffffff) | (long) (this.getMinimumLikelihood() & 0xff) << 24); + long write = ((long) (readDepth & 0xffffff) | (long) (this.getMinimumLikelihood() & 0xff) << 24); out.writeUInt(write); - out.writeUByte((short) rmsMapQ); + out.writeUByte(rmsMapQ); } /** @@ -249,9 +246,9 @@ public abstract class GLFRecord { /** * find the minimum value in a set of doubles * - * @param vals + * @param vals the array of values * - * @return + * @return the minimum value */ protected static double findMin(double vals[]) { if (vals.length < 1) throw new StingException("findMin: an array of size < 1 was passed in"); @@ -293,5 +290,16 @@ public abstract class GLFRecord { * @return a short of the minimum likelihood. */ protected abstract short calculateMinLikelihood(); + + public boolean equals(GLFRecord rec) { + return (rec != null) && + contig.equals(rec.getContig()) && + (refBase == rec.getRefBase()) && + (position == rec.getPosition()) && + (readDepth == rec.getReadDepth()) && + (rmsMapQ == rec.getRmsMapQ()); + } + + } diff --git a/java/src/org/broadinstitute/sting/utils/genotype/glf/GLFSingleCall.java b/java/src/org/broadinstitute/sting/utils/genotype/glf/GLFSingleCall.java index dd3dfbb70..2f52fc193 100644 --- a/java/src/org/broadinstitute/sting/utils/genotype/glf/GLFSingleCall.java +++ b/java/src/org/broadinstitute/sting/utils/genotype/glf/GLFSingleCall.java @@ -116,6 +116,16 @@ public class GLFSingleCall extends GLFRecord { return toCappedShort(minLikelihood); } + @Override + public boolean equals(GLFRecord rec) { + if (!super.equals(rec)) return false; + if (!(rec instanceof GLFSingleCall)) return false; + if (((GLFSingleCall) rec).getLikelihoods().length != this.likelihoods.length) return false; + for (int x = 0; x < likelihoods.length; x++) + if (Double.compare(likelihoods[x],((GLFSingleCall) rec).getLikelihoods()[x]) != 0) return false; + return this.getMinimumLikelihood() == rec.getMinimumLikelihood(); + } + public double[] getLikelihoods() { return likelihoods; } diff --git a/java/src/org/broadinstitute/sting/utils/genotype/glf/GLFVariableLengthCall.java b/java/src/org/broadinstitute/sting/utils/genotype/glf/GLFVariableLengthCall.java index 2276c13d6..d670ff4d0 100644 --- a/java/src/org/broadinstitute/sting/utils/genotype/glf/GLFVariableLengthCall.java +++ b/java/src/org/broadinstitute/sting/utils/genotype/glf/GLFVariableLengthCall.java @@ -56,8 +56,6 @@ public class GLFVariableLengthCall extends GLFRecord { * @param contig the contig this record is on * @param refBase the reference base * @param offset the location, as an offset from the previous glf record - * @param offset the location, as an offset from the previous glf record - s * @param readDepth the read depth at the specified postion * @param rmsMapQ the root mean square of the mapping quality * @param lkHom1 the negitive log likelihood of the first homozygous indel allele, from 0 to 255 @@ -158,4 +156,19 @@ public class GLFVariableLengthCall extends GLFRecord { public int getIndelLen1() { return indelLen1; } + + public boolean equals(GLFRecord rec) { + if (!super.equals(rec)) return false; + if (!(rec instanceof GLFVariableLengthCall)) return false; + if (lkHom1 != ((GLFVariableLengthCall) rec).getLkHom1()) return false; + if (lkHom2 != ((GLFVariableLengthCall) rec).getLkHom2()) return false; + if (lkHet != ((GLFVariableLengthCall) rec).getLkHet()) return false; + if (indelLen1 != ((GLFVariableLengthCall) rec).getIndelLen1()) return false; + if (indelLen2 != ((GLFVariableLengthCall) rec).getIndelLen2()) return false; + for (int x = 0; x < indelSeq1.length; x++) + if (indelSeq1[x] != ((GLFVariableLengthCall) rec).getIndelSeq1()[x]) return false; + for (int x = 0; x < indelSeq2.length; x++) + if (indelSeq2[x] != ((GLFVariableLengthCall) rec).getIndelSeq2()[x]) return false; + return minlikelihood == rec.getMinimumLikelihood() && size == rec.getByteSize(); + } } diff --git a/java/src/org/broadinstitute/sting/utils/genotype/glf/GLFWriter.java b/java/src/org/broadinstitute/sting/utils/genotype/glf/GLFWriter.java index 51433e61f..1b57353dc 100755 --- a/java/src/org/broadinstitute/sting/utils/genotype/glf/GLFWriter.java +++ b/java/src/org/broadinstitute/sting/utils/genotype/glf/GLFWriter.java @@ -3,14 +3,16 @@ package org.broadinstitute.sting.utils.genotype.glf; import net.sf.samtools.SAMSequenceRecord; import net.sf.samtools.util.BinaryCodec; import net.sf.samtools.util.BlockCompressedOutputStream; -import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.gatk.contexts.variantcontext.MutableGenotype; +import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContext; import org.broadinstitute.sting.utils.GenomeLocParser; -import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; -import org.broadinstitute.sting.utils.pileup.PileupElement; -import org.broadinstitute.sting.utils.genotype.LikelihoodObject; -import org.broadinstitute.sting.utils.genotype.IndelLikelihood; +import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.genotype.CalledGenotype; -import org.broadinstitute.sting.gatk.contexts.variantcontext.*; +import org.broadinstitute.sting.utils.genotype.IndelLikelihood; +import org.broadinstitute.sting.utils.genotype.LikelihoodObject; +import org.broadinstitute.sting.utils.genotype.vcf.VCFGenotypeRecord; +import org.broadinstitute.sting.utils.pileup.PileupElement; +import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; import java.io.DataOutputStream; import java.io.File; @@ -66,6 +68,9 @@ public class GLFWriter implements GLFGenotypeWriter { // the last position written private int lastPos = 1; + // a field for storing the RMS of the mapping qualities in a mutable variant context + public static final String RMS_MAPPING_QUAL = "RMS_MAPPING_QUAL"; + /** * The public constructor for creating a GLF object * @@ -96,8 +101,8 @@ public class GLFWriter implements GLFGenotypeWriter { */ public void writeHeader(String headerText) { this.headerText = headerText; - for (int x = 0; x < glfMagic.length; x++) { - outputBinaryCodec.writeUByte(glfMagic[x]); + for (short aGlfMagic : glfMagic) { + outputBinaryCodec.writeUByte(aGlfMagic); } if (!(headerText.equals(""))) { outputBinaryCodec.writeString(headerText, true, true); @@ -175,10 +180,19 @@ public class GLFWriter implements GLFGenotypeWriter { // calculate the RMS mapping qualities and the read depth double rms = 0.0; int readCount = 0; - if ( pileup != null ) { + + if ( pileup != null) { rms = calculateRMS(pileup); readCount = pileup.size(); } + // if we can't get the rms from the read pile-up (preferred), check the tags, the VC might have it + if (genotype.hasAttribute(RMS_MAPPING_QUAL) && new Double(0.0).equals(rms)) + rms = (Double)((MutableGenotype)genotype).getAttribute(RMS_MAPPING_QUAL); + + // if we can't get the depth from the read pile-up (preferred), check the tags, the VC might have it + if (genotype.hasAttribute(VCFGenotypeRecord.DEPTH_KEY) && 0 == readCount) + readCount = (Integer)((MutableGenotype)genotype).getAttribute(VCFGenotypeRecord.DEPTH_KEY); + addCall(GenomeLocParser.getContigInfo(vc.getLocation().getContig()), (int)vc.getLocation().getStart(), (float) rms, ref, readCount, obj); } diff --git a/java/test/org/broadinstitute/sting/gatk/refdata/VariantContextAdaptorsTest.java b/java/test/org/broadinstitute/sting/gatk/refdata/VariantContextAdaptorsTest.java new file mode 100644 index 000000000..7243bdf5a --- /dev/null +++ b/java/test/org/broadinstitute/sting/gatk/refdata/VariantContextAdaptorsTest.java @@ -0,0 +1,106 @@ +package org.broadinstitute.sting.gatk.refdata; + +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContext; +import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.fasta.IndexedFastaSequenceFile; +import org.broadinstitute.sting.utils.genotype.GenotypeWriter; +import org.broadinstitute.sting.utils.genotype.GenotypeWriterFactory; +import org.broadinstitute.sting.utils.genotype.glf.GLFSingleCall; +import org.broadinstitute.sting.utils.genotype.glf.GLFWriter; +import org.junit.Assert; +import org.junit.BeforeClass; +import org.junit.Test; + +import java.io.File; +import java.io.FileNotFoundException; +import java.util.ArrayList; +import java.util.List; + + +/** + * + * @author aaron + * + * Class VariantContextAdaptorsTest + * + * This test class exists to test input -> output of variant formats + * run through the VariantContext class. + */ +public class VariantContextAdaptorsTest extends BaseTest { + public static IndexedFastaSequenceFile seq = null; + + @BeforeClass + public static void beforeClass() { + try { + seq = new IndexedFastaSequenceFile(new File(oneKGLocation + "/reference/human_b36_both.fasta")); // TODO: make human reference use BaseTest + } catch (FileNotFoundException e) { + Assert.fail("Unable to load reference " + oneKGLocation + "/reference/human_b36_both.fasta"); + } + GenomeLocParser.setupRefContigOrdering(seq); + } + + + /** + * this test takes a known GLF file, reads in the records (storing them into an array), + * and creates VariantContext records. These VC records are then outputted through a genotype writer, + * and then read back in off of disk and compared to the original records. This way we are positive all + * the information that encodes a GLF makes it into the VC and then out to disk. + */ + @Test + public void testVariantContextGLFToGLF() { + + // our input and output files + File referenceFile = new File(validationDataLocation + "/well_formed.glf"); // our known good GLF + File tempFile = new File("temp.glf"); // our temporary GLF output -> input file + tempFile.deleteOnExit(); // delete when we're done + + // create our genotype writer for GLFs + GenotypeWriter gw = GenotypeWriterFactory.create(GenotypeWriterFactory.GENOTYPE_FORMAT.GLF,tempFile); + ((GLFWriter)gw).writeHeader(""); + + RodGLF glf = new RodGLF("myROD"); // now cycle the input file to the output file + try { + glf.initialize(referenceFile); + } catch (FileNotFoundException e) { + Assert.fail("Unable to open GLF file" + referenceFile); + } + + // buffer the records we see + List records = new ArrayList(); + + // while we have records, make a Variant Context and output it to a GLF file + while (glf.hasNext()) { + glf.next(); + records.add((GLFSingleCall)glf.mRecord); // we know they're all single calls in the reference file + VariantContext vc = VariantContextAdaptors.toVariantContext("GLF",glf); + gw.addCall(vc); + } + gw.close(); // close the file + + + // now reopen the file with the temp GLF file and read it back in, compare against what we first stored + glf = new RodGLF("myROD"); + try { + glf.initialize(tempFile); + } catch (FileNotFoundException e) { + Assert.fail("Unable to open GLF file" + tempFile); + } + + // buffer the new records we see + List records2 = new ArrayList(); + + // while we have records, make a Variant Context and output it to a GLF file + while (glf.hasNext()) { + glf.next(); + records2.add((GLFSingleCall)glf.mRecord); // we know they're all single calls in the reference file + } + + // compare sizes + Assert.assertEquals("The input GLF file doesn't contain the same number of records as we saw in the first file", records.size(),records2.size()); + + // now compare each record + for (int x = 0; x < records.size(); x++) + Assert.assertTrue("GLF Records were not preserved when cycling them to and from disc", records.get(x).equals(records2.get(x))); + } +}