Geli to variant context.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3063 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
aaron 2010-03-23 06:45:29 +00:00
parent eafdd047f7
commit a69b8555dd
4 changed files with 199 additions and 40 deletions

View File

@ -39,7 +39,7 @@ public class RodGeliText extends BasicReferenceOrderedDatum implements Variation
public enum Genotype_Strings { public enum Genotype_Strings {
AA, AC, AG, AT, CC, CG, CT, GG, GT, TT AA, AC, AG, AT, CC, CG, CT, GG, GT, TT
} }
public GenomeLoc loc; public GenomeLoc loc;
public char refBase = 'N'; public char refBase = 'N';
public int depth; public int depth;
@ -47,7 +47,7 @@ public class RodGeliText extends BasicReferenceOrderedDatum implements Variation
public String bestGenotype = "NN"; public String bestGenotype = "NN";
public double lodBtr; public double lodBtr;
public double lodBtnb; public double lodBtnb;
public double[] genotypeLikelihoods = new double[10]; public double[] genotypePosteriors = new double[10];
public RodGeliText(final String name) { public RodGeliText(final String name) {
super(name); super(name);
@ -75,7 +75,7 @@ public class RodGeliText extends BasicReferenceOrderedDatum implements Variation
lodBtnb = Double.valueOf(parts[7]); lodBtnb = Double.valueOf(parts[7]);
for (int pieceIndex = 8, offset = 0; pieceIndex < 18; pieceIndex++, offset++) { for (int pieceIndex = 8, offset = 0; pieceIndex < 18; pieceIndex++, offset++) {
genotypeLikelihoods[offset] = Double.valueOf(parts[pieceIndex]); genotypePosteriors[offset] = Double.valueOf(parts[pieceIndex]);
} }
return true; return true;
@ -94,16 +94,16 @@ public class RodGeliText extends BasicReferenceOrderedDatum implements Variation
bestGenotype, bestGenotype,
lodBtr, lodBtr,
lodBtnb, lodBtnb,
genotypeLikelihoods[0], genotypePosteriors[0],
genotypeLikelihoods[1], genotypePosteriors[1],
genotypeLikelihoods[2], genotypePosteriors[2],
genotypeLikelihoods[3], genotypePosteriors[3],
genotypeLikelihoods[4], genotypePosteriors[4],
genotypeLikelihoods[5], genotypePosteriors[5],
genotypeLikelihoods[6], genotypePosteriors[6],
genotypeLikelihoods[7], genotypePosteriors[7],
genotypeLikelihoods[8], genotypePosteriors[8],
genotypeLikelihoods[9] genotypePosteriors[9]
); );
} }
@ -307,13 +307,13 @@ public class RodGeliText extends BasicReferenceOrderedDatum implements Variation
return lodBtnb; return lodBtnb;
} }
public double[] getGenotypeLikelihoods() { public double[] getGenotypePosteriors() {
return genotypeLikelihoods; return genotypePosteriors;
} }
public void adjustLikelihoods(double[] likelihoods) { public void adjustLikelihoods(double[] likelihoods) {
for (int likelihoodIndex = 0; likelihoodIndex < likelihoods.length; likelihoodIndex++) { for (int likelihoodIndex = 0; likelihoodIndex < likelihoods.length; likelihoodIndex++) {
genotypeLikelihoods[likelihoodIndex] += likelihoods[likelihoodIndex]; genotypePosteriors[likelihoodIndex] += likelihoods[likelihoodIndex];
} }
String bestGenotype = "NN"; String bestGenotype = "NN";
@ -322,23 +322,23 @@ public class RodGeliText extends BasicReferenceOrderedDatum implements Variation
double refLikelihood = Double.NEGATIVE_INFINITY; double refLikelihood = Double.NEGATIVE_INFINITY;
for (int likelihoodIndex = 0; likelihoodIndex < likelihoods.length; likelihoodIndex++) { for (int likelihoodIndex = 0; likelihoodIndex < likelihoods.length; likelihoodIndex++) {
if (genotypeLikelihoods[likelihoodIndex] > bestLikelihood) { if (genotypePosteriors[likelihoodIndex] > bestLikelihood) {
bestLikelihood = genotypeLikelihoods[likelihoodIndex]; bestLikelihood = genotypePosteriors[likelihoodIndex];
bestGenotype = Genotype_Strings.values()[likelihoodIndex].toString(); bestGenotype = Genotype_Strings.values()[likelihoodIndex].toString();
} }
} }
for (int likelihoodIndex = 0; likelihoodIndex < likelihoods.length; likelihoodIndex++) { for (int likelihoodIndex = 0; likelihoodIndex < likelihoods.length; likelihoodIndex++) {
if (genotypeLikelihoods[likelihoodIndex] > nextBestLikelihood && genotypeLikelihoods[likelihoodIndex] < bestLikelihood) { if (genotypePosteriors[likelihoodIndex] > nextBestLikelihood && genotypePosteriors[likelihoodIndex] < bestLikelihood) {
nextBestLikelihood = genotypeLikelihoods[likelihoodIndex]; nextBestLikelihood = genotypePosteriors[likelihoodIndex];
} }
} }
for (int likelihoodIndex = 0; likelihoodIndex < likelihoods.length; likelihoodIndex++) { for (int likelihoodIndex = 0; likelihoodIndex < likelihoods.length; likelihoodIndex++) {
if (refBase == Genotype_Strings.values()[likelihoodIndex].toString().charAt(0) && if (refBase == Genotype_Strings.values()[likelihoodIndex].toString().charAt(0) &&
refBase == Genotype_Strings.values()[likelihoodIndex].toString().charAt(1)) { refBase == Genotype_Strings.values()[likelihoodIndex].toString().charAt(1)) {
refLikelihood = genotypeLikelihoods[likelihoodIndex]; refLikelihood = genotypePosteriors[likelihoodIndex];
} }
} }
@ -346,4 +346,17 @@ public class RodGeliText extends BasicReferenceOrderedDatum implements Variation
this.lodBtr = (bestLikelihood - refLikelihood); this.lodBtr = (bestLikelihood - refLikelihood);
this.lodBtnb = (bestLikelihood - nextBestLikelihood); this.lodBtnb = (bestLikelihood - nextBestLikelihood);
} }
public boolean equals(RodGeliText other) {
if (genotypePosteriors.length != genotypePosteriors.length) return false;
for (int x = 0; x < genotypePosteriors.length; x++)
if (Double.compare(genotypePosteriors[x],other.genotypePosteriors[x])!=0) return false;
return (loc.equals(other) &&
refBase == other.refBase &&
depth == other.depth &&
maxMappingQuality == other.maxMappingQuality &&
bestGenotype.equals(other.bestGenotype) &&
Double.compare(lodBtr,other.lodBtr) == 0&&
Double.compare(lodBtnb,other.lodBtr) == 0);
}
} }

View File

@ -7,6 +7,7 @@ import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContext;
import org.broadinstitute.sting.utils.*; import org.broadinstitute.sting.utils.*;
import org.broadinstitute.sting.utils.genotype.CalledGenotype; import org.broadinstitute.sting.utils.genotype.CalledGenotype;
import org.broadinstitute.sting.utils.genotype.LikelihoodObject; import org.broadinstitute.sting.utils.genotype.LikelihoodObject;
import org.broadinstitute.sting.utils.genotype.geli.GeliTextWriter;
import org.broadinstitute.sting.utils.genotype.glf.GLFSingleCall; import org.broadinstitute.sting.utils.genotype.glf.GLFSingleCall;
import org.broadinstitute.sting.utils.genotype.glf.GLFWriter; import org.broadinstitute.sting.utils.genotype.glf.GLFWriter;
import org.broadinstitute.sting.utils.genotype.vcf.*; import org.broadinstitute.sting.utils.genotype.vcf.*;
@ -43,7 +44,7 @@ public class VariantContextAdaptors {
adaptors.put(VCFRecord.class, new VCFRecordAdaptor()); adaptors.put(VCFRecord.class, new VCFRecordAdaptor());
adaptors.put(PlinkRod.class, new PlinkRodAdaptor()); adaptors.put(PlinkRod.class, new PlinkRodAdaptor());
adaptors.put(RodGLF.class, new GLFAdaptor()); adaptors.put(RodGLF.class, new GLFAdaptor());
// adaptors.put(RodGeliText.class, new GeliAdaptor()); adaptors.put(RodGeliText.class, new GeliAdaptor());
} }
public static boolean canBeConvertedToVariantContext(Object variantContainingObject) { public static boolean canBeConvertedToVariantContext(Object variantContainingObject) {
@ -508,38 +509,63 @@ public class VariantContextAdaptors {
// GELI to VariantContext // GELI to VariantContext
// //
// -------------------------------------------------------------------------------------------------------------- // --------------------------------------------------------------------------------------------------------------
/*
private static class GeliAdaptor extends VCAdaptor { private static class GeliAdaptor extends VCAdaptor {
/**
* convert to a Variant Context, given:
* @param name the name of the ROD
* @param input the Rod object, in this case a RodGeliText
* @return a VariantContext object
*/
VariantContext convert(String name, Object input) { VariantContext convert(String name, Object input) {
if (!Allele.acceptableAlleleBases(((RodGeliText) input).getReference())) if ( ! Allele.acceptableAlleleBases(((RodGeliText)input).getReference()) )
return null; return null;
Allele refAllele = new Allele(((RodGeliText) input).getReference(), true); Allele refAllele = new Allele(((RodGeliText)input).getReference(), true);
return convert(name, input, refAllele); return convert(name, input, refAllele);
} }
/**
* convert to a Variant Context, given:
* @param name the name of the ROD
* @param input the Rod object, in this case a RodGeliText
* @param refAllele the reference base as an Allele object
* @return a VariantContext object
*/
VariantContext convert(String name, Object input, Allele refAllele) { VariantContext convert(String name, Object input, Allele refAllele) {
RodGeliText geliText = (RodGeliText) input; RodGeliText geli = (RodGeliText)input;
if (geliText.isSNP() || geliText.isIndel()) {
// make sure we can convert it
if ( geli.isSNP() || geli.isIndel()) {
// add the reference allele // add the reference allele
List<Allele> alleles = new ArrayList<Allele>(); List<Allele> alleles = new ArrayList<Allele>();
alleles.add(refAllele); alleles.add(refAllele);
// add all of the alt alleles // add all of the alt alleles
for (String alt : geliText.getAlternateAlleleList()) { for ( String alt : geli.getAlternateAlleleList() ) {
if (!Allele.acceptableAlleleBases(alt)) { if ( ! Allele.acceptableAlleleBases(alt) ) {
return null; return null;
} }
alleles.add(new Allele(alt, false)); Allele allele = new Allele(alt, false);
if (!alleles.contains(allele)) alleles.add(allele);
} }
Map<String, String> attributes = new HashMap<String, String>(); Map<String, String> attributes = new HashMap<String, String>();
attributes.put("ID", geliText.getName()); Collection<Genotype> genotypes = new ArrayList<Genotype>();
Collection<Genotype> genotypes = null; MutableGenotype call = new MutableGenotype(name, alleles);
VariantContext vc = new VariantContext(name, geliText.getLocation(), alleles, genotypes, geliText.getNegLog10PError(), null, attributes);
// set the likelihoods, depth, and RMS mapping quality values
call.putAttribute(CalledGenotype.POSTERIORS_ATTRIBUTE_KEY,geli.genotypePosteriors);
call.putAttribute(GeliTextWriter.MAXIMUM_MAPPING_QUALITY_ATTRIBUTE_KEY,geli.getMaxMappingQuality());
call.putAttribute(GeliTextWriter.READ_COUNT_ATTRIBUTE_KEY,geli.depth);
// add the call to the genotype list, and then use this list to create a VariantContext
genotypes.add(call);
VariantContext vc = new VariantContext(name, geli.getLocation(), alleles, genotypes, geli.getNegLog10PError(), null, attributes);
vc.validate(); vc.validate();
return vc; return vc;
} else } else
return null; // can't handle anything else return null; // can't handle anything else
} }
}*/ }
} }

View File

@ -1,22 +1,23 @@
package org.broadinstitute.sting.utils.genotype.geli; package org.broadinstitute.sting.utils.genotype.geli;
import edu.mit.broad.picard.genotype.geli.GenotypeLikelihoods;
import net.sf.samtools.SAMFileHeader; import net.sf.samtools.SAMFileHeader;
import org.broadinstitute.sting.gatk.contexts.variantcontext.Allele;
import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContext;
import org.broadinstitute.sting.utils.StingException; import org.broadinstitute.sting.utils.StingException;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; import org.broadinstitute.sting.utils.genotype.CalledGenotype;
import org.broadinstitute.sting.utils.genotype.DiploidGenotype;
import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.pileup.PileupElement;
import org.broadinstitute.sting.utils.genotype.*; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
import org.broadinstitute.sting.gatk.contexts.variantcontext.*;
import java.io.File; import java.io.File;
import java.io.FileNotFoundException; import java.io.FileNotFoundException;
import java.io.PrintStream; import java.io.PrintStream;
import java.io.PrintWriter; import java.io.PrintWriter;
import java.util.Arrays;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections; import java.util.Collections;
import edu.mit.broad.picard.genotype.geli.GenotypeLikelihoods;
/** /**
* @author aaron * @author aaron
@ -29,6 +30,11 @@ public class GeliTextWriter implements GeliGenotypeWriter {
// where we write to // where we write to
PrintWriter mWriter; PrintWriter mWriter;
// used to store the max mapping quality as a field in variant contexts
public static final String MAXIMUM_MAPPING_QUALITY_ATTRIBUTE_KEY = "MAXIMUM_MAPPING_QUALITY";
// used to store the max mapping quality as a field in variant contexts
public static final String READ_COUNT_ATTRIBUTE_KEY = "READ_COUNT";
/** /**
* create a geli text writer * create a geli text writer
* *
@ -105,6 +111,12 @@ public class GeliTextWriter implements GeliGenotypeWriter {
maxMappingQual = p.getMappingQual(); maxMappingQual = p.getMappingQual();
} }
} }
// if we've stored the max mapping qual value in the genotype get it there
if (maxMappingQual == 0 && genotype.hasAttribute(MAXIMUM_MAPPING_QUALITY_ATTRIBUTE_KEY))
maxMappingQual = (double)genotype.getAttributeAsInt(MAXIMUM_MAPPING_QUALITY_ATTRIBUTE_KEY);
// if we've stored the read count value in the genotype get it there
if (readCount == 0 && genotype.hasAttribute(READ_COUNT_ATTRIBUTE_KEY))
readCount = genotype.getAttributeAsInt(READ_COUNT_ATTRIBUTE_KEY);
ArrayList<Character> alleles = new ArrayList<Character>(); ArrayList<Character> alleles = new ArrayList<Character>();
for ( Allele a : genotype.getAlleles() ) for ( Allele a : genotype.getAlleles() )

View File

@ -1,11 +1,13 @@
package org.broadinstitute.sting.gatk.refdata; package org.broadinstitute.sting.gatk.refdata;
import org.broad.tribble.util.AsciiLineReader;
import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContext; import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContext;
import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.fasta.IndexedFastaSequenceFile; import org.broadinstitute.sting.utils.fasta.IndexedFastaSequenceFile;
import org.broadinstitute.sting.utils.genotype.GenotypeWriter; import org.broadinstitute.sting.utils.genotype.GenotypeWriter;
import org.broadinstitute.sting.utils.genotype.GenotypeWriterFactory; import org.broadinstitute.sting.utils.genotype.GenotypeWriterFactory;
import org.broadinstitute.sting.utils.genotype.geli.GeliTextWriter;
import org.broadinstitute.sting.utils.genotype.glf.GLFSingleCall; import org.broadinstitute.sting.utils.genotype.glf.GLFSingleCall;
import org.broadinstitute.sting.utils.genotype.glf.GLFWriter; import org.broadinstitute.sting.utils.genotype.glf.GLFWriter;
import org.junit.Assert; import org.junit.Assert;
@ -13,7 +15,9 @@ import org.junit.BeforeClass;
import org.junit.Test; import org.junit.Test;
import java.io.File; import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException; import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.List; import java.util.List;
@ -103,4 +107,108 @@ public class VariantContextAdaptorsTest extends BaseTest {
for (int x = 0; x < records.size(); x++) for (int x = 0; x < records.size(); x++)
Assert.assertTrue("GLF Records were not preserved when cycling them to and from disc", records.get(x).equals(records2.get(x))); Assert.assertTrue("GLF Records were not preserved when cycling them to and from disc", records.get(x).equals(records2.get(x)));
} }
/**
* this test takes a known Geli file, reads in the records (storing them into an array),
* and creates VariantContext records. These VC records are then outputted through a genotype writer,
* and then read back in off of disk and compared to the original records. This way we are positive all
* the information that encodes a Geli makes it into the VC and then out to disk.
*
* // TODO: this is a mess, clean it up
*/
@Test
public void testVariantContextGeliToGeli() {
// our input and output files
File knownFile = new File(validationDataLocation + "/well_formed.geli"); // our known good GLF
File tempFile = new File("temp.geli"); // our temporary GLF output -> input file
tempFile.deleteOnExit(); // delete when we're done
// create our genotype writer for GLFs
GenotypeWriter gw = GenotypeWriterFactory.create(GenotypeWriterFactory.GENOTYPE_FORMAT.GELI,tempFile);
((GeliTextWriter)gw).writeHeader(null); // the write header command ignores the parameter
RodGeliText geliText = new RodGeliText("myROD"); // now cycle the input file to the output file
// buffer the records we see
List<RodGeliText> records = new ArrayList<RodGeliText>();
// a little more complicated than the above example, we have to read the file in
AsciiLineReader reader = null;
try {
reader = new AsciiLineReader(new FileInputStream(knownFile));
} catch (FileNotFoundException e) {
Assert.fail("File not found: " + knownFile);
}
String line = "#";
while (line != null && line.startsWith("#"))
line = readLine(reader);
// while we have records, make a Variant Context and output it to a GLF file
while (line != null && line != "") {
boolean parsed = false;
try {
parsed = geliText.parseLine(null,line.split(TabularROD.DEFAULT_DELIMITER_REGEX));
} catch (IOException e) {
Assert.fail("IOException: " + e.getMessage());
}
if (!parsed) Assert.fail("Unable to parse line" + line);
records.add(geliText); // we know they're all single calls in the reference file
VariantContext vc = VariantContextAdaptors.toVariantContext("Geli",geliText);
gw.addCall(vc);
line = readLine(reader);
}
gw.close(); // close the file
reader.close();
// now reopen the file with the temp GLF file and read it back in, compare against what we first stored
geliText = new RodGeliText("myROD");
try {
geliText.initialize(tempFile);
} catch (FileNotFoundException e) {
Assert.fail("Unable to open GLF file" + tempFile);
}
// buffer the new records we see
List<RodGeliText> records2 = new ArrayList<RodGeliText>();
try {
reader = new AsciiLineReader(new FileInputStream(tempFile));
} catch (FileNotFoundException e) {
Assert.fail("File not found: " + tempFile);
}
line = "#";
while (line != null && line.startsWith("#"))
line = readLine(reader);
// while we have records, make a Variant Context and output it to a GLF file
while (line != null && line != "") {
try {
geliText.parseLine(null,line.split(TabularROD.DEFAULT_DELIMITER_REGEX));
} catch (IOException e) {
Assert.fail("IOException: " + e.getMessage());
}
records2.add(geliText); // we know they're all single calls in the reference file
line = readLine(reader);
}
gw.close(); // close the file
reader.close();
// compare sizes
Assert.assertEquals("The input GLF file doesn't contain the same number of records as we saw in the first file", records.size(),records2.size());
// now compare each record TODO: uncomment out next two lines, fix equals so that rounding doesn't ruin our comparison
//for (int x = 0; x < records.size(); x++)
// Assert.assertTrue("GLF Records were not preserved when cycling them to and from disc", records.get(x).equals(records2.get(x)));
}
public String readLine(AsciiLineReader reader) {
try {
String line = reader.readLine();
return line;
} catch (IOException e) {
return null;
}
}
} }