From 98e3a0bf1ac0801a148cb83c7cb9bdbfe415379d Mon Sep 17 00:00:00 2001 From: aaron Date: Thu, 8 Oct 2009 19:50:04 +0000 Subject: [PATCH] VCF can now be emitted from SSG. The basic's are there (the genotype, read depth, our error estimate), but more fields need to be added for each record as nessasary. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1797 348d0f76-0448-11de-a6fe-93d51630548a --- .../genotyper/EMGenotypeCalculationModel.java | 22 +- ...{SSGenotypeCall.java => GenotypeCall.java} | 71 +++-- .../genotyper/SingleSampleGenotyper.java | 51 ++-- .../walkers/genotyper/UnifiedGenotyper.java | 30 ++- .../gatk/walkers/CoverageEvalWalker.java | 4 +- .../gatk/walkers/DeNovoSNPWalker.java | 17 +- .../sting/utils/genotype/GenotypeCall.java | 53 ---- .../sting/utils/genotype/GenotypeWriter.java | 13 + .../utils/genotype/GenotypeWriterFactory.java | 7 +- .../sting/utils/genotype/SampleBacked.java | 23 ++ .../utils/genotype/geli/GeliAdapter.java | 23 +- .../utils/genotype/geli/GeliTextWriter.java | 25 +- .../sting/utils/genotype/glf/GLFWriter.java | 15 ++ .../genotype/tabular/TabularLFWriter.java | 18 ++ .../vcf/VCFGenotypeWriterAdapter.java | 247 ++++++++++++++++++ .../sting/utils/genotype/vcf/VCFRecord.java | 10 +- .../sting/utils/genotype/vcf/VCFWriter.java | 4 +- ...ypeCallTest.java => GenotypeCallTest.java} | 14 +- 18 files changed, 494 insertions(+), 153 deletions(-) rename java/src/org/broadinstitute/sting/gatk/walkers/genotyper/{SSGenotypeCall.java => GenotypeCall.java} (78%) delete mode 100644 java/src/org/broadinstitute/sting/utils/genotype/GenotypeCall.java create mode 100644 java/src/org/broadinstitute/sting/utils/genotype/SampleBacked.java create mode 100644 java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFGenotypeWriterAdapter.java rename java/test/org/broadinstitute/sting/gatk/walkers/genotyper/{SSGenotypeCallTest.java => GenotypeCallTest.java} (82%) diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/EMGenotypeCalculationModel.java b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/EMGenotypeCalculationModel.java index 3bb73c55b..4689d31fe 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/EMGenotypeCalculationModel.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/EMGenotypeCalculationModel.java @@ -1,15 +1,16 @@ package org.broadinstitute.sting.gatk.walkers.genotyper; -import org.broadinstitute.sting.utils.*; -import org.broadinstitute.sting.utils.genotype.GenotypeWriter; -import org.broadinstitute.sting.utils.genotype.DiploidGenotype; +import net.sf.samtools.SAMRecord; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.utils.BaseUtils; +import org.broadinstitute.sting.utils.ReadBackedPileup; +import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.genotype.DiploidGenotype; -import java.util.*; - -import net.sf.samtools.SAMRecord; -import net.sf.samtools.SAMReadGroupRecord; +import java.util.Collection; +import java.util.HashMap; +import java.util.List; public class EMGenotypeCalculationModel extends GenotypeCalculationModel { @@ -79,14 +80,15 @@ public class EMGenotypeCalculationModel extends GenotypeCalculationModel { // for now, we need to special-case single sample mode if ( samples.size() == 1 ) { - UnifiedGenotypeLikelihoods UGL = GLs.get(samples.iterator().next()); + String sample = samples.iterator().next(); + UnifiedGenotypeLikelihoods UGL = GLs.get(sample); // if there were no good bases, the likelihoods object wouldn't exist if ( UGL == null ) return false; callsMetrics.nCalledBases++; UGL.setPriors(priors); - SSGenotypeCall call = new SSGenotypeCall(context.getLocation(), ref, UGL.getGenotypeLikelihoods(), new ReadBackedPileup(ref, context)); + GenotypeCall call = new GenotypeCall(sample,context.getLocation(), ref, UGL.getGenotypeLikelihoods(), new ReadBackedPileup(ref, context)); if ( GENOTYPE_MODE || call.isVariant(call.getReference()) ) { double confidence = (GENOTYPE_MODE ? call.getNegLog10PError() : call.toVariation().getNegLog10PError()); @@ -183,7 +185,7 @@ public class EMGenotypeCalculationModel extends GenotypeCalculationModel { */ - //return new SSGenotypeCall(context.getLocation(), ref,gl, pileup); + //return new GenotypeCall(context.getLocation(), ref,gl, pileup); return true; } diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SSGenotypeCall.java b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeCall.java similarity index 78% rename from java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SSGenotypeCall.java rename to java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeCall.java index 74d51053b..88274e596 100644 --- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SSGenotypeCall.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeCall.java @@ -15,10 +15,10 @@ import java.util.List; *

* Class SSGenotypeCall *

- * The single sample implementation of the genotype interface, which contains + * The mplementation of the genotype interface, which contains * extra information for the various genotype outputs */ -public class SSGenotypeCall implements Genotype, ReadBacked, GenotypesBacked, LikelihoodsBacked, PosteriorsBacked { +public class GenotypeCall implements Genotype, ReadBacked, GenotypesBacked, LikelihoodsBacked, PosteriorsBacked, SampleBacked { private final char mRefBase; private final GenotypeLikelihoods mGenotypeLikelihoods; @@ -33,18 +33,20 @@ public class SSGenotypeCall implements Genotype, ReadBacked, GenotypesBacked, Li private DiploidGenotype mRefGenotype = null; private DiploidGenotype mNextGenotype = null; - // are we best vrs ref or best vrs next - for internal consumption only - //private final boolean mBestVrsRef; + // the sample name, used to propulate the SampleBacked interface + private String mSampleName; /** * Generate a single sample genotype object, containing everything we need to represent calls out of a genotyper object * - * @param location the location we're working with - * @param refBase the ref base - * @param gtlh the genotype likelihoods object - * @param pileup the pile-up of reads at the specified locus + * @param sampleName the sample name + * @param location the location we're working with + * @param refBase the ref base + * @param gtlh the genotype likelihoods object + * @param pileup the pile-up of reads at the specified locus */ - public SSGenotypeCall(GenomeLoc location, char refBase, GenotypeLikelihoods gtlh, ReadBackedPileup pileup) { + public GenotypeCall(String sampleName, GenomeLoc location, char refBase, GenotypeLikelihoods gtlh, ReadBackedPileup pileup) { + mSampleName = sampleName; mRefBase = String.valueOf(refBase).toUpperCase().charAt(0); // a round about way to make sure the ref base is up-case mGenotypeLikelihoods = gtlh; mLocation = location; @@ -54,12 +56,14 @@ public class SSGenotypeCall implements Genotype, ReadBacked, GenotypesBacked, Li /** * Generate a single sample genotype object, containing everything we need to represent calls out of a genotyper object * - * @param location the location we're working with - * @param refBase the ref base - * @param gtlh the genotype likelihoods object - * @param pileup the pile-up of reads at the specified locus + * @param sampleName the sample name + * @param location the location we're working with + * @param refBase the ref base + * @param gtlh the genotype likelihoods object + * @param pileup the pile-up of reads at the specified locus */ - SSGenotypeCall(GenomeLoc location, char refBase, GenotypeLikelihoods gtlh, ReadBackedPileup pileup, DiploidGenotype genotype) { + GenotypeCall(String sampleName, GenomeLoc location, char refBase, GenotypeLikelihoods gtlh, ReadBackedPileup pileup, DiploidGenotype genotype) { + mSampleName = sampleName; mRefBase = String.valueOf(refBase).toUpperCase().charAt(0); // a round about way to make sure the ref base is up-case mGenotypeLikelihoods = gtlh; mLocation = location; @@ -73,8 +77,8 @@ public class SSGenotypeCall implements Genotype, ReadBacked, GenotypesBacked, Li if (other == null) return false; - if (other instanceof SSGenotypeCall) { - SSGenotypeCall otherCall = (SSGenotypeCall) other; + if (other instanceof GenotypeCall) { + GenotypeCall otherCall = (GenotypeCall) other; if (!this.mGenotypeLikelihoods.equals(otherCall.mGenotypeLikelihoods)) { return false; @@ -90,8 +94,8 @@ public class SSGenotypeCall implements Genotype, ReadBacked, GenotypesBacked, Li public String toString() { lazyEval(); return String.format("%s best=%s cmp=%s ref=%s depth=%d negLog10PError = %.2f, likelihoods=%s", - getLocation(), mGenotype, mRefGenotype, mRefBase, mPileup.getReads().size(), - getNegLog10PError(), Arrays.toString(mGenotypeLikelihoods.getLikelihoods())); + getLocation(), mGenotype, mRefGenotype, mRefBase, mPileup.getReads().size(), + getNegLog10PError(), Arrays.toString(mGenotypeLikelihoods.getLikelihoods())); } private void lazyEval() { @@ -122,25 +126,19 @@ public class SSGenotypeCall implements Genotype, ReadBacked, GenotypesBacked, Li return Math.abs(mGenotypeLikelihoods.getPosterior(getBestGenotype()) - mGenotypeLikelihoods.getPosterior(getNextBest())); } - /** - * get the best genotype - */ + /** get the best genotype */ private DiploidGenotype getBestGenotype() { lazyEval(); return mGenotype; } - /** - * get the alternate genotype - */ + /** get the alternate genotype */ private DiploidGenotype getNextBest() { lazyEval(); return mNextGenotype; } - /** - * get the alternate genotype - */ + /** get the alternate genotype */ private DiploidGenotype getRefGenotype() { lazyEval(); return mRefGenotype; @@ -211,6 +209,7 @@ public class SSGenotypeCall implements Genotype, ReadBacked, GenotypesBacked, Li * given the reference, are we a variant? (non-ref) * * @param ref the reference base or bases + * * @return true if we're a variant */ @Override @@ -270,7 +269,7 @@ public class SSGenotypeCall implements Genotype, ReadBacked, GenotypesBacked, Li * @return an array in lexigraphical order of the likelihoods */ public Genotype getGenotype(DiploidGenotype x) { - return new SSGenotypeCall(mLocation, mRefBase, mGenotypeLikelihoods, mPileup, x); + return new GenotypeCall(mSampleName, mLocation, mRefBase, mGenotypeLikelihoods, mPileup, x); } /** @@ -292,6 +291,22 @@ public class SSGenotypeCall implements Genotype, ReadBacked, GenotypesBacked, Li public double[] getPosteriors() { return this.mGenotypeLikelihoods.getPosteriors(); } + + /** @return returns the sample name for this genotype */ + @Override + public String getSampleName() { + return this.mSampleName; + } + + /** + * get the filtering string for this genotype + * + * @return a string, representing the genotyping value + */ + @Override + public String getFilteringValue() { + return "0"; + } } diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SingleSampleGenotyper.java b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SingleSampleGenotyper.java index dbd58589a..731026c13 100644 --- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SingleSampleGenotyper.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SingleSampleGenotyper.java @@ -10,13 +10,15 @@ import org.broadinstitute.sting.gatk.walkers.ReadFilters; import org.broadinstitute.sting.utils.BaseUtils; import org.broadinstitute.sting.utils.ReadBackedPileup; import org.broadinstitute.sting.utils.cmdLine.Argument; +import org.broadinstitute.sting.utils.genotype.Genotype; import org.broadinstitute.sting.utils.genotype.GenotypeWriter; import org.broadinstitute.sting.utils.genotype.GenotypeWriterFactory; import java.io.File; +import java.util.Arrays; @ReadFilters(ZeroMappingQualityReadFilter.class) -public class SingleSampleGenotyper extends LocusWalker { +public class SingleSampleGenotyper extends LocusWalker { // Control output settings @Argument(fullName = "variants_out", shortName = "varout", doc = "File to which variants should be written", required = false) public File VARIANTS_FILE = null; @@ -47,6 +49,8 @@ public class SingleSampleGenotyper extends LocusWalker= LOD_THRESHOLD) { sum.nConfidentCalls++; //System.out.printf("Call %s%n", call); - sum.writer.addGenotypeCall(call); + if (sum.writer.supportsMulitSample()) { + sum.writer.addMultiSampleCall(Arrays.asList((Genotype)call)); + } else { + sum.writer.addGenotypeCall(call); + } } else sum.nNonConfidentCalls++; } diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java index 49b50928e..3a1606ff0 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java @@ -25,19 +25,25 @@ package org.broadinstitute.sting.gatk.walkers.genotyper; +import net.sf.samtools.SAMReadGroupRecord; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; -import org.broadinstitute.sting.gatk.contexts.*; -import org.broadinstitute.sting.gatk.filters.*; -import org.broadinstitute.sting.gatk.refdata.*; -import org.broadinstitute.sting.gatk.walkers.*; -import org.broadinstitute.sting.utils.*; -import org.broadinstitute.sting.utils.cmdLine.*; -import org.broadinstitute.sting.utils.genotype.*; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.filters.MissingReadGroupFilter; +import org.broadinstitute.sting.gatk.filters.ZeroMappingQualityReadFilter; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.refdata.rodDbSNP; +import org.broadinstitute.sting.gatk.walkers.LocusWalker; +import org.broadinstitute.sting.gatk.walkers.ReadFilters; +import org.broadinstitute.sting.utils.BaseUtils; +import org.broadinstitute.sting.utils.cmdLine.Argument; +import org.broadinstitute.sting.utils.cmdLine.ArgumentCollection; +import org.broadinstitute.sting.utils.genotype.GenotypeWriter; +import org.broadinstitute.sting.utils.genotype.GenotypeWriterFactory; import java.io.File; -import java.util.*; - -import net.sf.samtools.SAMReadGroupRecord; +import java.util.HashSet; +import java.util.List; @ReadFilters({ZeroMappingQualityReadFilter.class, MissingReadGroupFilter.class}) @@ -91,7 +97,9 @@ public class UnifiedGenotyper extends LocusWalker { // create the output writer stream if ( VARIANTS_FILE != null ) - writer = GenotypeWriterFactory.create(VAR_FORMAT, GenomeAnalysisEngine.instance.getSAMFileHeader(), VARIANTS_FILE); + writer = GenotypeWriterFactory.create(VAR_FORMAT, GenomeAnalysisEngine.instance.getSAMFileHeader(), VARIANTS_FILE, + "UnifiedGenotyper", + this.getToolkit().getArguments().referenceFile.getName()); else writer = GenotypeWriterFactory.create(VAR_FORMAT, GenomeAnalysisEngine.instance.getSAMFileHeader(), out); diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/CoverageEvalWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/CoverageEvalWalker.java index 5d5c3ea04..b79bb4716 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/CoverageEvalWalker.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/CoverageEvalWalker.java @@ -6,7 +6,7 @@ import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.refdata.RodGenotypeChipAsGFF; import org.broadinstitute.sting.gatk.walkers.LocusWalker; -import org.broadinstitute.sting.gatk.walkers.genotyper.SSGenotypeCall; +import org.broadinstitute.sting.gatk.walkers.genotyper.GenotypeCall; import org.broadinstitute.sting.gatk.walkers.genotyper.SingleSampleGenotyper; import org.broadinstitute.sting.utils.genotype.DiploidGenotype; import org.broadinstitute.sting.utils.BaseUtils; @@ -80,7 +80,7 @@ public class CoverageEvalWalker extends LocusWalker, String> { List sub_offsets = ListUtils.sliceListByIndices(subset_indices, offsets); AlignmentContext subContext = new AlignmentContext(context.getLocation(), sub_reads, sub_offsets); - SSGenotypeCall call = SSG.map(tracker, ref, subContext); + GenotypeCall call = SSG.map(tracker, ref, subContext); String callType = (call.isVariant(call.getReference())) ? ((call.isHom()) ? "HomozygousSNP" : "HeterozygousSNP") : "HomozygousReference"; if (call != null) { diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/DeNovoSNPWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/DeNovoSNPWalker.java index 961014981..c1c6d163b 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/DeNovoSNPWalker.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/DeNovoSNPWalker.java @@ -1,22 +1,19 @@ package org.broadinstitute.sting.playground.gatk.walkers; +import net.sf.samtools.SAMRecord; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.filters.ZeroMappingQualityReadFilter; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.refdata.VariationRod; import org.broadinstitute.sting.gatk.walkers.*; +import org.broadinstitute.sting.gatk.walkers.genotyper.GenotypeCall; import org.broadinstitute.sting.gatk.walkers.genotyper.SingleSampleGenotyper; -import org.broadinstitute.sting.gatk.walkers.genotyper.SSGenotypeCall; -import org.broadinstitute.sting.gatk.filters.ZeroMappingQualityReadFilter; -import org.broadinstitute.sting.utils.StingException; -import org.broadinstitute.sting.utils.genotype.VariantBackedByGenotype; import org.broadinstitute.sting.utils.genotype.Variation; -import java.util.Set; -import java.util.List; import java.util.ArrayList; - -import net.sf.samtools.SAMRecord; +import java.util.List; +import java.util.Set; /** * Created by IntelliJ IDEA. @@ -73,10 +70,10 @@ public class DeNovoSNPWalker extends RefWalker{ } AlignmentContext parent1_subContext = new AlignmentContext(context.getLocation(), parent1_reads, parent1_offsets); - SSGenotypeCall parent1 = SSG.map(tracker, ref, parent1_subContext); + GenotypeCall parent1 = SSG.map(tracker, ref, parent1_subContext); AlignmentContext parent2_subContext = new AlignmentContext(context.getLocation(), parent2_reads, parent2_offsets); - SSGenotypeCall parent2 = SSG.map(tracker, ref, parent2_subContext); + GenotypeCall parent2 = SSG.map(tracker, ref, parent2_subContext); if (!parent1.isVariant(parent1.getReference()) && parent1.getNegLog10PError() > 5 && diff --git a/java/src/org/broadinstitute/sting/utils/genotype/GenotypeCall.java b/java/src/org/broadinstitute/sting/utils/genotype/GenotypeCall.java deleted file mode 100644 index 9e087c77c..000000000 --- a/java/src/org/broadinstitute/sting/utils/genotype/GenotypeCall.java +++ /dev/null @@ -1,53 +0,0 @@ -package org.broadinstitute.sting.utils.genotype; - -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.genotype.Genotype; - -import java.util.List; - -/** - * @author aaron - *

- * Interface GenotypeCall - *

- * Genotype call interface, for indicating that a genotype is - * also a genotype call. - */ -public interface GenotypeCall extends Genotype { - /** - * gets the reference base - * - * @return the reference base we represent - */ - public char getReferencebase(); - - /** - * check to see if this called genotype is a variant, i.e. not homozygous reference - * @return true if it's not hom ref, false otherwise - */ - public boolean isVariation(); - - /** - * Location of this genotype on the reference (on the forward strand). If the allele is insertion/deletion, the first inserted/deleted base - * is located right after the specified location - * - * @return position on the genome wrapped in GenomeLoc object - */ - public GenomeLoc getLocation(); - - /** - * get the genotypes, sorted in asscending order by their ConfidenceScores (the best - * to the worst ConfidenceScores) - * - * @return a list of the genotypes, sorted by ConfidenceScores - */ - public List getGenotypes(); - - /** - * get the genotypes sorted lexigraphically - * - * @return a list of the genotypes sorted lexi - */ - public List getLexigraphicallySortedGenotypes(); - -} diff --git a/java/src/org/broadinstitute/sting/utils/genotype/GenotypeWriter.java b/java/src/org/broadinstitute/sting/utils/genotype/GenotypeWriter.java index 90d3210b8..8ae61f08a 100644 --- a/java/src/org/broadinstitute/sting/utils/genotype/GenotypeWriter.java +++ b/java/src/org/broadinstitute/sting/utils/genotype/GenotypeWriter.java @@ -1,5 +1,7 @@ package org.broadinstitute.sting.utils.genotype; +import java.util.List; + /* * Copyright (c) 2009 The Broad Institute * @@ -50,4 +52,15 @@ public interface GenotypeWriter { /** finish writing, closing any open files. */ public void close(); + /** + * add a multi-sample call if we support it + * @param genotypes the list of genotypes, that are backed by sample information + */ + public void addMultiSampleCall(List genotypes); + + /** + * @return true if we support multisample, false otherwise + */ + public boolean supportsMulitSample(); + } diff --git a/java/src/org/broadinstitute/sting/utils/genotype/GenotypeWriterFactory.java b/java/src/org/broadinstitute/sting/utils/genotype/GenotypeWriterFactory.java index e2ad1968b..94c36da8f 100644 --- a/java/src/org/broadinstitute/sting/utils/genotype/GenotypeWriterFactory.java +++ b/java/src/org/broadinstitute/sting/utils/genotype/GenotypeWriterFactory.java @@ -5,6 +5,7 @@ import org.broadinstitute.sting.utils.StingException; import org.broadinstitute.sting.utils.genotype.geli.GeliAdapter; import org.broadinstitute.sting.utils.genotype.geli.GeliTextWriter; import org.broadinstitute.sting.utils.genotype.glf.GLFWriter; +import org.broadinstitute.sting.utils.genotype.vcf.VCFGenotypeWriterAdapter; import java.io.File; import java.io.PrintStream; @@ -20,7 +21,7 @@ import java.io.PrintStream; public class GenotypeWriterFactory { /** available genotype writers */ public enum GENOTYPE_FORMAT { - GELI, GLF, GFF, TABULAR, GELI_BINARY; + GELI, GLF, GFF, TABULAR, GELI_BINARY, VCF; } /** @@ -30,7 +31,7 @@ public class GenotypeWriterFactory { * @param destination the destination file * @return the genotype writer object */ - public static GenotypeWriter create(GENOTYPE_FORMAT format, SAMFileHeader header, File destination) { + public static GenotypeWriter create(GENOTYPE_FORMAT format, SAMFileHeader header, File destination, String source, String referenceName ) { switch (format) { case GLF: return new GLFWriter(header.toString(), destination); @@ -38,6 +39,8 @@ public class GenotypeWriterFactory { return new GeliTextWriter(destination); case GELI_BINARY: return new GeliAdapter(destination, header); + case VCF: + return new VCFGenotypeWriterAdapter(source, referenceName, destination); default: throw new StingException("Genotype writer " + format.toString() + " is not implemented"); } diff --git a/java/src/org/broadinstitute/sting/utils/genotype/SampleBacked.java b/java/src/org/broadinstitute/sting/utils/genotype/SampleBacked.java new file mode 100644 index 000000000..583e20dfb --- /dev/null +++ b/java/src/org/broadinstitute/sting/utils/genotype/SampleBacked.java @@ -0,0 +1,23 @@ +package org.broadinstitute.sting.utils.genotype; + +/** + * @author aaron + *

+ * Interface SampleBacked + *

+ * A descriptions should go here. Blame aaron if it's missing. + */ +public interface SampleBacked { + + /** + * + * @return returns the sample name for this genotype + */ + public String getSampleName(); + + /** + * get the filtering string for this genotype + * @return a string, representing the genotyping value + */ + public String getFilteringValue(); +} diff --git a/java/src/org/broadinstitute/sting/utils/genotype/geli/GeliAdapter.java b/java/src/org/broadinstitute/sting/utils/genotype/geli/GeliAdapter.java index c56ecfa87..c3fc7c1a3 100644 --- a/java/src/org/broadinstitute/sting/utils/genotype/geli/GeliAdapter.java +++ b/java/src/org/broadinstitute/sting/utils/genotype/geli/GeliAdapter.java @@ -5,7 +5,6 @@ import edu.mit.broad.picard.genotype.geli.GenotypeLikelihoods; import net.sf.samtools.SAMFileHeader; import net.sf.samtools.SAMRecord; import net.sf.samtools.SAMSequenceRecord; -import org.broadinstitute.sting.gatk.walkers.genotyper.SSGenotypeCall; import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.genotype.*; @@ -111,7 +110,7 @@ public class GeliAdapter implements GenotypeWriter { int readDepth = -1; double nextVrsBest = 0; double nextVrsRef = 0; - if (!(locus instanceof GenotypesBacked)) { + if (!(locus instanceof PosteriorsBacked)) { posteriors = new double[10]; Arrays.fill(posteriors, Double.MIN_VALUE); } else { @@ -128,8 +127,8 @@ public class GeliAdapter implements GenotypeWriter { if (maxMappingQual < rec.getMappingQuality()) maxMappingQual = rec.getMappingQuality(); } } - SSGenotypeCall call = (SSGenotypeCall)locus; - LikelihoodObject obj = new LikelihoodObject(call.getProbabilities(), LikelihoodObject.LIKELIHOOD_TYPE.LOG); + + LikelihoodObject obj = new LikelihoodObject(posteriors, LikelihoodObject.LIKELIHOOD_TYPE.LOG); this.addGenotypeCall(GenomeLocParser.getContigInfo(locus.getLocation().getContig()), (int)locus.getLocation().getStart(), ref, @@ -155,4 +154,20 @@ public class GeliAdapter implements GenotypeWriter { this.writer.close(); } } + + /** + * add a multi-sample call if we support it + * + * @param genotypes the list of genotypes, that are backed by sample information + */ + @Override + public void addMultiSampleCall( List genotypes) { + throw new UnsupportedOperationException("Geli binary doesn't support multisample calls"); + } + + /** @return true if we support multisample, false otherwise */ + @Override + public boolean supportsMulitSample() { + return false; + } } diff --git a/java/src/org/broadinstitute/sting/utils/genotype/geli/GeliTextWriter.java b/java/src/org/broadinstitute/sting/utils/genotype/geli/GeliTextWriter.java index 58727e134..912d60a1d 100644 --- a/java/src/org/broadinstitute/sting/utils/genotype/geli/GeliTextWriter.java +++ b/java/src/org/broadinstitute/sting/utils/genotype/geli/GeliTextWriter.java @@ -1,4 +1,3 @@ - package org.broadinstitute.sting.utils.genotype.geli; import net.sf.samtools.SAMRecord; @@ -66,17 +65,17 @@ public class GeliTextWriter implements GenotypeWriter { } else { posteriors = ((PosteriorsBacked) locus).getPosteriors(); double[] lks; - lks = Arrays.copyOf(posteriors,posteriors.length); + lks = Arrays.copyOf(posteriors, posteriors.length); Arrays.sort(lks); nextVrsBest = lks[9] - lks[8]; - if (ref != 'X') { - int index = (DiploidGenotype.valueOf(Utils.dupString(ref,2)).ordinal()); + if (ref != 'X') { + int index = (DiploidGenotype.valueOf(Utils.dupString(ref, 2)).ordinal()); nextVrsRef = lks[9] - posteriors[index]; } } double maxMappingQual = 0; if (locus instanceof ReadBacked) { - List recs = ((ReadBacked)locus).getReads(); + List recs = ((ReadBacked) locus).getReads(); readDepth = recs.size(); for (SAMRecord rec : recs) { if (maxMappingQual < rec.getMappingQuality()) maxMappingQual = rec.getMappingQuality(); @@ -118,4 +117,20 @@ public class GeliTextWriter implements GenotypeWriter { public void close() { mWriter.close(); } + + /** + * add a multi-sample call if we support it + * + * @param genotypes the list of genotypes, that are backed by sample information + */ + @Override + public void addMultiSampleCall(List genotypes) { + throw new UnsupportedOperationException("Geli text doesn't support multisample calls"); + } + + /** @return true if we support multisample, false otherwise */ + @Override + public boolean supportsMulitSample() { + return false; + } } diff --git a/java/src/org/broadinstitute/sting/utils/genotype/glf/GLFWriter.java b/java/src/org/broadinstitute/sting/utils/genotype/glf/GLFWriter.java index 9f6e096c8..96583f2d3 100755 --- a/java/src/org/broadinstitute/sting/utils/genotype/glf/GLFWriter.java +++ b/java/src/org/broadinstitute/sting/utils/genotype/glf/GLFWriter.java @@ -278,6 +278,21 @@ public class GLFWriter implements GenotypeWriter { } + /** + * add a multi-sample call if we support it + * + * @param genotypes the list of genotypes, that are backed by sample information + */ + @Override + public void addMultiSampleCall(List genotypes) { + throw new UnsupportedOperationException("GLF writer doesn't support multisample calls"); + } + + /** @return true if we support multisample, false otherwise */ + @Override + public boolean supportsMulitSample() { + return false; + } } diff --git a/java/src/org/broadinstitute/sting/utils/genotype/tabular/TabularLFWriter.java b/java/src/org/broadinstitute/sting/utils/genotype/tabular/TabularLFWriter.java index 879e3f08b..ccf94193a 100644 --- a/java/src/org/broadinstitute/sting/utils/genotype/tabular/TabularLFWriter.java +++ b/java/src/org/broadinstitute/sting/utils/genotype/tabular/TabularLFWriter.java @@ -7,6 +7,7 @@ import java.io.File; import java.io.FileNotFoundException; import java.io.PrintStream; import java.util.Arrays; +import java.util.List; /** @@ -95,4 +96,21 @@ public class TabularLFWriter implements GenotypeWriter { outStream.close(); } } + + + /** + * add a multi-sample call if we support it + * + * @param genotypes the list of genotypes, that are backed by sample information + */ + @Override + public void addMultiSampleCall(List genotypes) { + throw new UnsupportedOperationException("Tabular LF doesn't support multisample calls"); + } + + /** @return true if we support multisample, false otherwise */ + @Override + public boolean supportsMulitSample() { + return false; + } } diff --git a/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFGenotypeWriterAdapter.java b/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFGenotypeWriterAdapter.java new file mode 100644 index 000000000..a76ea29ad --- /dev/null +++ b/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFGenotypeWriterAdapter.java @@ -0,0 +1,247 @@ +package org.broadinstitute.sting.utils.genotype.vcf; + +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.genotype.Genotype; +import org.broadinstitute.sting.utils.genotype.GenotypeWriter; +import org.broadinstitute.sting.utils.genotype.ReadBacked; +import org.broadinstitute.sting.utils.genotype.SampleBacked; + +import java.io.File; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + + +/** + * @author aaron + *

+ * Class VCFGenotypeWriterAdapter + *

+ * Adapt the VCF writter to the genotype output system + */ +public class VCFGenotypeWriterAdapter implements GenotypeWriter { + // our VCF objects + private VCFWriter mWriter = null; + private VCFHeader mHeader = null; + private String mSource; + private String mReferenceName; + private final Map mSampleNames = new HashMap(); + private boolean mInitialized = false; + private final File mFile; + + public VCFGenotypeWriterAdapter(String source, String referenceName, File writeTo) { + mReferenceName = referenceName; + mSource = source; + mFile = writeTo; + } + + + /** + * initialize this VCF writer + * + * @param genotypes the genotypes + * @param file the file location to write to + */ + private void lazyInitialize(List genotypes, File file) { + Map hInfo = new HashMap(); + List sampleNames = getSampleNames(genotypes); + + // setup the header fields + hInfo.put("format", "VCRv3.2"); + hInfo.put("source", mSource); + + // setup the sample names + mHeader = new VCFHeader(hInfo, sampleNames); + mWriter = new VCFWriter(mHeader, file); + mInitialized = true; + } + + /** + * get the samples names from genotype objects + * + * @param genotypes the genotype list + * + * @return a list of strings representing the sample names + */ + private static List getSampleNames(List genotypes) { + List strings = new ArrayList(); + for (Genotype genotype : genotypes) { + if (!(genotype instanceof SampleBacked)) + throw new IllegalArgumentException("Genotypes passed to VCF must be backed by SampledBacked interface"); + strings.add(((SampleBacked) genotype).getSampleName()); + } + return strings; + } + + /** + * Add a genotype, given a genotype locus + * + * @param call the locus to add + */ + @Override + public void addGenotypeCall(Genotype call) { + throw new UnsupportedOperationException("VCF is a multi sample fomat"); + } + + /** + * add a no call to the genotype file, if supported. + * + * @param position the position to add the no call at + */ + @Override + public void addNoCall(int position) { + throw new UnsupportedOperationException("VCF is a multi sample fomat"); + } + + /** finish writing, closing any open files. */ + @Override + public void close() { + mWriter.close(); + } + + /** + * add a multi-sample call if we support it + * + * @param genotypes the list of genotypes, that are backed by sample information + */ + @Override + public void addMultiSampleCall(List genotypes) { + if (!mInitialized) + lazyInitialize(genotypes, mFile); + + + VCFParamters params = new VCFParamters(); + params.addFormatItem("GT"); + + for (Genotype gtype : genotypes) { + // setup the parameters + params.setLocations(gtype.getLocation(), gtype.getReference()); + + Map map = new HashMap(); + if (!(gtype instanceof SampleBacked)) { + throw new IllegalArgumentException("Genotypes passed to VCF must be backed by SampledBacked interface"); + } + + // calculate the RMS mapping qualities and the read depth + if (gtype instanceof ReadBacked) { + int readDepth = ((ReadBacked) gtype).getReadCount(); + map.put("RD", String.valueOf(readDepth)); + params.addFormatItem("RD"); + } + double qual = gtype.getNegLog10PError(); + map.put("GQ", String.format("%.2f", qual)); + params.addFormatItem("GQ"); + + List alleles = new ArrayList(); + for (char allele : gtype.getBases().toCharArray()) { + alleles.add(String.valueOf(allele)); + params.addAlternateBase(allele); + } + + VCFGenotypeRecord record = new VCFGenotypeRecord(((SampleBacked) gtype).getSampleName(), + alleles, + VCFGenotypeRecord.PHASE.UNPHASED, + map); + params.addGenotypeRecord(record); + } + + Map infoFields = new HashMap(); + + VCFRecord vcfRecord = new VCFRecord(params.getReferenceBase(), + params.getContig(), + params.getPosition(), + ".", + params.getAlternateBases(), + 0, /* BETTER VALUE HERE */ + ".", + infoFields, + params.getFormatString(), + params.getGenotypesRecords()); + + mWriter.addRecord(vcfRecord); + } + + /** @return true if we support multisample, false otherwise */ + @Override + public boolean supportsMulitSample() { + return true; + } + + + /** + * a helper class, which performs a lot of the safety checks on the parameters + * we feed to the VCF (like ensuring the same position for each genotype in a call). + */ + class VCFParamters { + private char referenceBase = '0'; + private int position = 0; + private String contig = null; + private boolean initialized = false; + private List genotypesRecord = new ArrayList(); + private List formatList = new ArrayList(); + private List alternateBases = new ArrayList(); + + public void setLocations(GenomeLoc location, char refBase) { + // if we haven't set it up, we initialize the object + if (!initialized) { + initialized = true; + this.contig = location.getContig(); + this.position = (int)location.getStart(); + if (location.getStart() != location.getStop()) { + throw new IllegalArgumentException("The start and stop locations must be the same"); + } + this.referenceBase = refBase; + } else { + if (contig.equals(this.contig)) + throw new IllegalArgumentException("The contig name has to be the same at a single locus"); + if (position != this.position) + throw new IllegalArgumentException("The position has to be the same at a single locus"); + if (refBase != this.referenceBase) + throw new IllegalArgumentException("The reference base name has to be the same at a single locus"); + } + } + + /** @return get the position */ + public int getPosition() { + return position; + } + + /** @return get the contig name */ + public String getContig() { + return contig; + } + + /** @return get the reference base */ + public char getReferenceBase() { + return referenceBase; + } + + public void addGenotypeRecord(VCFGenotypeRecord record) { + this.genotypesRecord.add(record); + } + + public void addFormatItem(String item) { + if (!formatList.contains(item)) + formatList.add(item); + } + + public void addAlternateBase(char base) { + if (!alternateBases.contains(String.valueOf(base)) && base != this.getReferenceBase()) + alternateBases.add(String.valueOf(base)); + } + + public List getAlternateBases() { + return alternateBases; + } + + public String getFormatString() { + return Utils.join(";", formatList); + } + + public List getGenotypesRecords() { + return genotypesRecord; + } + } +} diff --git a/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFRecord.java b/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFRecord.java index 2682e5a65..f2b1d65f7 100644 --- a/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFRecord.java +++ b/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFRecord.java @@ -216,8 +216,8 @@ public class VCFRecord { */ public Map getInfoValues() { if (this.mInfoFields.size() < 1) { - Map map = new HashMap(); - map.put(".",""); + Map map = new HashMap(); + map.put(".", ""); return map; } return this.mInfoFields; @@ -305,6 +305,10 @@ public class VCFRecord { this.mInfoFields.put(key, value); } + /** + * the generation of a string representation, which is used by the VCF writer + * @return a string + */ public String toString() { StringBuilder builder = new StringBuilder(); @@ -340,7 +344,7 @@ public class VCFRecord { if (rec.getFields().get(s).equals("")) continue; builder.append(":"); builder.append(rec.getFields().get(s)); - } + } } } return builder.toString(); diff --git a/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFWriter.java b/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFWriter.java index 3da5c2757..773bf05f3 100644 --- a/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFWriter.java +++ b/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFWriter.java @@ -70,14 +70,16 @@ public class VCFWriter { } + + /** attempt to close the VCF file */ public void close() { try { + mWriter.flush(); mWriter.close(); } catch (IOException e) { throw new RuntimeException("Unable to close VCFFile"); } } - } diff --git a/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/SSGenotypeCallTest.java b/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeCallTest.java similarity index 82% rename from java/test/org/broadinstitute/sting/gatk/walkers/genotyper/SSGenotypeCallTest.java rename to java/test/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeCallTest.java index 03c623283..27de8d5ff 100644 --- a/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/SSGenotypeCallTest.java +++ b/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeCallTest.java @@ -22,7 +22,7 @@ import java.util.List; *

* test the SS Genotype call class */ -public class SSGenotypeCallTest extends BaseTest { +public class GenotypeCallTest extends BaseTest { // we need a fake GenotypeLikelihoods class public class GenotypeLikelihoodsImpl extends GenotypeLikelihoods { @@ -79,21 +79,21 @@ public class SSGenotypeCallTest extends BaseTest { @Test public void testBestVrsRefSame() { Pair myPair = makePileup(); - SSGenotypeCall call = new SSGenotypeCall(myPair.second, 'A', new GenotypeLikelihoodsImpl(), myPair.first); + GenotypeCall call = new GenotypeCall("TESTSAMPLE",myPair.second, 'A', new GenotypeLikelihoodsImpl(), myPair.first); Assert.assertEquals(0, call.toVariation().getNegLog10PError(), 0.0000001); } @Test public void testBestVrsRef2() { Pair myPair = makePileup(); - SSGenotypeCall call2 = new SSGenotypeCall(myPair.second, 'T', new GenotypeLikelihoodsImpl(), myPair.first); + GenotypeCall call2 = new GenotypeCall("TESTSAMPLE",myPair.second, 'T', new GenotypeLikelihoodsImpl(), myPair.first); Assert.assertEquals(9, call2.toVariation().getNegLog10PError(), 0.0000001); } @Test public void testBestVrsRef3() { Pair myPair = makePileup(); - SSGenotypeCall call3 = new SSGenotypeCall(myPair.second, 'C', new GenotypeLikelihoodsImpl(), myPair.first); + GenotypeCall call3 = new GenotypeCall("TESTSAMPLE",myPair.second, 'C', new GenotypeLikelihoodsImpl(), myPair.first); Assert.assertEquals(4, call3.toVariation().getNegLog10PError(), 0.0000001); } @@ -101,21 +101,21 @@ public class SSGenotypeCallTest extends BaseTest { @Test public void testBestVrsNextSame() { Pair myPair = makePileup(); - SSGenotypeCall call = new SSGenotypeCall(myPair.second, 'A', new GenotypeLikelihoodsImpl(), myPair.first); + GenotypeCall call = new GenotypeCall("TESTSAMPLE",myPair.second, 'A', new GenotypeLikelihoodsImpl(), myPair.first); Assert.assertEquals(1, call.getNegLog10PError(), 0.0000001); } @Test public void testBestVrsNext2() { Pair myPair = makePileup(); - SSGenotypeCall call2 = new SSGenotypeCall(myPair.second, 'A', new GenotypeLikelihoodsImpl(), myPair.first); + GenotypeCall call2 = new GenotypeCall("TESTSAMPLE",myPair.second, 'A', new GenotypeLikelihoodsImpl(), myPair.first); Assert.assertEquals(1, call2.getNegLog10PError(), 0.0000001); } @Test public void testBestVrsNext3() { Pair myPair = makePileup(); - SSGenotypeCall call3 = new SSGenotypeCall(myPair.second, 'C', new GenotypeLikelihoodsImpl(), myPair.first); + GenotypeCall call3 = new GenotypeCall("TESTSAMPLE",myPair.second, 'C', new GenotypeLikelihoodsImpl(), myPair.first); Assert.assertEquals(1, call3.getNegLog10PError(), 0.0000001); } }