Merge branch 'master' of ssh://gsa2.broadinstitute.org/humgen/gsa-scr1/gsa-engineering/git/unstable

This commit is contained in:
Christopher Hartl 2012-09-12 10:09:42 -04:00
commit 546586b70e
14 changed files with 112 additions and 76 deletions

View File

@ -1,9 +1,9 @@
package org.broadinstitute.sting.gatk.walkers.genotyper;
import org.broadinstitute.sting.WalkerTest;
import org.testng.annotations.Test;
import java.util.Arrays;
import org.testng.annotations.Test;
/**
* Created by IntelliJ IDEA.
@ -52,7 +52,7 @@ public class UnifiedGenotyperGeneralPloidyIntegrationTest extends WalkerTest {
@Test(enabled = true)
public void testINDEL_GGA_Pools() {
PC_LSV_Test(String.format(" -maxAltAlleles 1 -ploidy 24 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles %s",LSV_ALLELES),"LSV_INDEL_GGA","INDEL","90af837f372e3d5143af30bf5c8c2b75");
PC_LSV_Test(String.format(" -maxAltAlleles 1 -ploidy 24 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles %s",LSV_ALLELES),"LSV_INDEL_GGA","INDEL","567ae6b2a7f839b1307d4087c2f59cca");
}
@Test(enabled = true)
@ -62,7 +62,7 @@ public class UnifiedGenotyperGeneralPloidyIntegrationTest extends WalkerTest {
@Test(enabled = true)
public void testINDEL_maxAltAlleles2_ploidy1_Pools_noRef() {
PC_LSV_Test_NoRef(" -maxAltAlleles 2 -ploidy 1","LSV_INDEL_DISC_NOREF_p1","INDEL","26598044436c8044f22ffa767b06a0f0");
PC_LSV_Test_NoRef(" -maxAltAlleles 2 -ploidy 1","LSV_INDEL_DISC_NOREF_p1","INDEL","d2a22e12f1969ae199557947e5039b58");
}
@Test(enabled = true)

View File

@ -38,7 +38,6 @@ import org.broadinstitute.sting.utils.*;
import org.broadinstitute.sting.utils.baq.BAQ;
import org.broadinstitute.sting.utils.classloader.PluginManager;
import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.pileup.PileupElement;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
@ -198,23 +197,7 @@ public class UnifiedGenotyperEngine {
}
}
return addMissingSamples(results, allSamples);
}
private List<VariantCallContext> addMissingSamples(final List<VariantCallContext> calls, final Set<String> allSamples) {
if ( calls.isEmpty() || allSamples == null ) return calls;
final List<VariantCallContext> withAllSamples = new ArrayList<VariantCallContext>(calls.size());
for ( final VariantCallContext call : calls ) {
if ( call == null )
withAllSamples.add(null);
else {
final VariantContext withoutMissing = VariantContextUtils.addMissingSamples(call, allSamples);
withAllSamples.add(new VariantCallContext(withoutMissing, call.confidentlyCalled, call.shouldEmit));
}
}
return withAllSamples;
return results;
}
/**

View File

@ -313,7 +313,7 @@ public class CombineVariants extends RodWalker<Integer, Integer> implements Tree
VariantContextUtils.calculateChromosomeCounts(builder, false);
if ( minimalVCF )
VariantContextUtils.pruneVariantContext(builder, Arrays.asList(SET_KEY));
vcfWriter.add(VariantContextUtils.addMissingSamples(builder.make(), samples));
vcfWriter.add(builder.make());
}
return vcs.isEmpty() ? 0 : 1;

View File

@ -246,7 +246,6 @@ public class VariantsToVCF extends RodWalker<Integer, Integer> {
}
vc = VariantContextUtils.purgeUnallowedGenotypeAttributes(vc, allowedGenotypeFormatStrings);
vc = VariantContextUtils.addMissingSamples(vc, samples);
vcfwriter.add(vc);
}

View File

@ -88,8 +88,8 @@ public abstract class VCFCompoundHeaderLine extends VCFHeaderLine implements VCF
case UNBOUNDED: return -1;
case A: return vc.getNAlleles() - 1;
case G:
final int ploidy = vc.getMaxPloidy();
return GenotypeLikelihoods.numLikelihoods(vc.getNAlleles(), ploidy == 0 ? 2 : ploidy);
final int ploidy = vc.getMaxPloidy(2);
return GenotypeLikelihoods.numLikelihoods(vc.getNAlleles(), ploidy);
default:
throw new ReviewedStingException("Unknown count type: " + countType);
}

View File

@ -53,6 +53,9 @@ import java.util.*;
*/
@Invariant({"alleles != null"})
public final class GenotypeBuilder {
private static final List<Allele> HAPLOID_NO_CALL = Arrays.asList(Allele.NO_CALL);
private static final List<Allele> DIPLOID_NO_CALL = Arrays.asList(Allele.NO_CALL, Allele.NO_CALL);
private String sampleName = null;
private List<Allele> alleles = Collections.emptyList();
@ -90,6 +93,23 @@ public final class GenotypeBuilder {
return new GenotypeBuilder(sampleName, alleles).PL(gls).make();
}
/**
* Create a new Genotype object for a sample that's missing from the VC (i.e., in
* the output header). Defaults to a diploid no call genotype ./.
*
* @param sampleName the name of this sample
* @return an initialized Genotype with sampleName that's a diploid ./. no call genotype
*/
public static Genotype createMissing(final String sampleName, final int ploidy) {
final GenotypeBuilder builder = new GenotypeBuilder(sampleName);
switch ( ploidy ) {
case 1: builder.alleles(HAPLOID_NO_CALL); break;
case 2: builder.alleles(DIPLOID_NO_CALL); break;
default: builder.alleles(Collections.nCopies(ploidy, Allele.NO_CALL)); break;
}
return builder.make();
}
/**
* Create a empty builder. Both a sampleName and alleles must be provided
* before trying to make a Genotype from this builder.

View File

@ -25,7 +25,6 @@
package org.broadinstitute.sting.utils.variantcontext;
import com.google.java.contract.Ensures;
import com.google.java.contract.Invariant;
import com.google.java.contract.Requires;
import java.util.*;
@ -413,14 +412,26 @@ public class GenotypesContext implements List<Genotype> {
return getGenotypes().get(i);
}
/**
* What is the max ploidy among all samples? Returns defaultPloidy if no genotypes are present
*
* @param defaultPloidy the default ploidy, if all samples are no-called
* @return
*/
@Ensures("result >= 0")
public int getMaxPloidy() {
public int getMaxPloidy(final int defaultPloidy) {
if ( defaultPloidy < 0 ) throw new IllegalArgumentException("defaultPloidy must be greater than or equal to 0");
if ( maxPloidy == -1 ) {
maxPloidy = 0; // necessary in the case where there are no genotypes
for ( final Genotype g : getGenotypes() ) {
maxPloidy = Math.max(g.getPloidy(), maxPloidy);
}
// everything is no called so we return the default ploidy
if ( maxPloidy == 0 ) maxPloidy = defaultPloidy;
}
return maxPloidy;
}

View File

@ -642,14 +642,15 @@ public class VariantContext implements Feature { // to enable tribble integratio
}
/**
* Returns the maximum ploidy of all samples in this VC, or -1 if there are no genotypes
* Returns the maximum ploidy of all samples in this VC, or default if there are no genotypes
*
* This function is caching, so it's only expensive on the first call
*
* @return -1, or the max ploidy
* @param defaultPloidy the default ploidy, if all samples are no-called
* @return default, or the max ploidy
*/
public int getMaxPloidy() {
return genotypes.getMaxPloidy();
public int getMaxPloidy(final int defaultPloidy) {
return genotypes.getMaxPloidy(defaultPloidy);
}
/**

View File

@ -32,8 +32,8 @@ import org.apache.log4j.Logger;
import org.broad.tribble.util.popgen.HardyWeinbergCalculation;
import org.broadinstitute.sting.commandline.Hidden;
import org.broadinstitute.sting.utils.*;
import org.broadinstitute.sting.utils.collections.Pair;
import org.broadinstitute.sting.utils.codecs.vcf.*;
import org.broadinstitute.sting.utils.collections.Pair;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.exceptions.UserException;
@ -47,7 +47,6 @@ public class VariantContextUtils {
public final static String MERGE_REF_IN_ALL = "ReferenceInAll";
public final static String MERGE_FILTER_PREFIX = "filterIn";
private static final List<Allele> DIPLOID_NO_CALL = Arrays.asList(Allele.NO_CALL, Allele.NO_CALL);
private static Set<String> MISSING_KEYS_WARNED_ABOUT = new HashSet<String>();
final public static JexlEngine engine = new JexlEngine();
@ -60,31 +59,6 @@ public class VariantContextUtils {
engine.setDebug(false);
}
/**
* Ensures that VC contains all of the samples in allSamples by adding missing samples to
* the resulting VC with default diploid ./. genotypes
*
* @param vc the VariantContext
* @param allSamples all of the samples needed
* @return a new VariantContext with missing samples added
*/
public static VariantContext addMissingSamples(final VariantContext vc, final Set<String> allSamples) {
// TODO -- what's the fastest way to do this calculation?
final Set<String> missingSamples = new HashSet<String>(allSamples);
missingSamples.removeAll(vc.getSampleNames());
if ( missingSamples.isEmpty() )
return vc;
else {
//logger.warn("Adding " + missingSamples.size() + " missing samples to called context");
final GenotypesContext gc = GenotypesContext.copy(vc.getGenotypes());
for ( final String missing : missingSamples ) {
gc.add(new GenotypeBuilder(missing).alleles(DIPLOID_NO_CALL).make());
}
return new VariantContextBuilder(vc).genotypes(gc).make();
}
}
/**
* Update the attributes of the attributes map given the VariantContext to reflect the
* proper chromosome-based VCF tags

View File

@ -272,11 +272,7 @@ public abstract class BCF2FieldWriter {
encodingType = BCF2Type.INT8;
buildAlleleMap(vc);
nValuesPerGenotype = vc.getMaxPloidy();
// deal with the case where we have no call everywhere, in which case we write out diploid
if ( nValuesPerGenotype == -1 )
nValuesPerGenotype = 2;
nValuesPerGenotype = vc.getMaxPloidy(2);
super.start(encoder, vc);
}

View File

@ -32,7 +32,10 @@ import org.broadinstitute.sting.utils.codecs.bcf2.BCF2Codec;
import org.broadinstitute.sting.utils.codecs.bcf2.BCF2Type;
import org.broadinstitute.sting.utils.codecs.bcf2.BCF2Utils;
import org.broadinstitute.sting.utils.codecs.bcf2.BCFVersion;
import org.broadinstitute.sting.utils.codecs.vcf.*;
import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants;
import org.broadinstitute.sting.utils.codecs.vcf.VCFContigHeaderLine;
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader;
import org.broadinstitute.sting.utils.codecs.vcf.VCFUtils;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.variantcontext.*;
@ -345,10 +348,12 @@ class BCF2Writer extends IndexingVariantContextWriter {
final BCF2FieldWriter.GenotypesWriter writer = fieldManager.getGenotypeFieldWriter(field);
if ( writer == null ) errorUnexpectedFieldToWrite(vc, field, "FORMAT");
assert writer != null;
writer.start(encoder, vc);
for ( final String name : sampleNames ) {
Genotype g = vc.getGenotype(name);
if ( g == null ) VCFWriter.missingSampleError(vc, header);
if ( g == null ) g = GenotypeBuilder.createMissing(name, writer.nValuesPerGenotype);
writer.addGenotype(encoder, vc, g);
}
writer.done(encoder, vc);

View File

@ -27,7 +27,6 @@ package org.broadinstitute.sting.utils.variantcontext.writer;
import net.sf.samtools.SAMSequenceDictionary;
import org.broad.tribble.TribbleException;
import org.broad.tribble.util.ParsingUtils;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.codecs.vcf.*;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.exceptions.UserException;
@ -339,13 +338,13 @@ class VCFWriter extends IndexingVariantContextWriter {
*/
private void addGenotypeData(VariantContext vc, Map<Allele, String> alleleMap, List<String> genotypeFormatKeys)
throws IOException {
final int ploidy = vc.getMaxPloidy(2);
for ( String sample : mHeader.getGenotypeSamples() ) {
mWriter.write(VCFConstants.FIELD_SEPARATOR);
Genotype g = vc.getGenotype(sample);
if ( g == null ) {
missingSampleError(vc, mHeader);
}
if ( g == null ) g = GenotypeBuilder.createMissing(sample, ploidy);
final List<String> attrs = new ArrayList<String>(genotypeFormatKeys.size());
for ( String field : genotypeFormatKeys ) {
@ -426,13 +425,6 @@ class VCFWriter extends IndexingVariantContextWriter {
}
}
public static final void missingSampleError(final VariantContext vc, final VCFHeader header) {
final List<String> badSampleNames = new ArrayList<String>();
for ( final String x : header.getGenotypeSamples() )
if ( ! vc.hasGenotype(x) ) badSampleNames.add(x);
throw new ReviewedStingException("BUG: we now require all samples in VCFheader to have genotype objects. Missing samples are " + Utils.join(",", badSampleNames));
}
private boolean isMissingValue(String s) {
// we need to deal with the case that it's a list of missing values
return (countOccurrences(VCFConstants.MISSING_VALUE_v4.charAt(0), s) + countOccurrences(',', s) == s.length());

View File

@ -596,6 +596,51 @@ public class VariantContextTestProvider {
return TEST_DATAs;
}
public static void testReaderWriterWithMissingGenotypes(final VariantContextIOTest tester, final VariantContextTestData data) throws IOException {
final int nSamples = data.header.getNGenotypeSamples();
if ( nSamples > 2 ) {
for ( final VariantContext vc : data.vcs )
if ( vc.isSymbolic() )
// cannot handle symbolic alleles because they may be weird non-call VCFs
return;
final File tmpFile = File.createTempFile("testReaderWriter", tester.getExtension());
tmpFile.deleteOnExit();
// write expected to disk
final EnumSet<Options> options = EnumSet.of(Options.INDEX_ON_THE_FLY);
final VariantContextWriter writer = tester.makeWriter(tmpFile, options);
final Set<String> samplesInVCF = new HashSet<String>(data.header.getGenotypeSamples());
final List<String> missingSamples = Arrays.asList("MISSING1", "MISSING2");
final List<String> allSamples = new ArrayList<String>(missingSamples);
allSamples.addAll(samplesInVCF);
final VCFHeader header = new VCFHeader(data.header.getMetaDataInInputOrder(), allSamples);
writeVCsToFile(writer, header, data.vcs);
// ensure writing of expected == actual
final Pair<VCFHeader, Iterable<VariantContext>> p = readAllVCs(tmpFile, tester.makeCodec());
final Iterable<VariantContext> actual = p.getSecond();
int i = 0;
for ( final VariantContext readVC : actual ) {
if ( readVC == null ) continue; // sometimes we read null records...
final VariantContext expected = data.vcs.get(i++);
for ( final Genotype g : readVC.getGenotypes() ) {
Assert.assertTrue(allSamples.contains(g.getSampleName()));
if ( samplesInVCF.contains(g.getSampleName()) ) {
assertEquals(g, expected.getGenotype(g.getSampleName()));
} else {
// missing
Assert.assertTrue(g.isNoCall());
}
}
}
}
}
public static void testReaderWriter(final VariantContextIOTest tester, final VariantContextTestData data) throws IOException {
testReaderWriter(tester, data.header, data.vcs, data.vcs, true);
}

View File

@ -82,6 +82,11 @@ public class VariantContextWritersUnitTest extends BaseTest {
VariantContextTestProvider.testReaderWriter(new BCFIOTester(), testData);
}
@Test(dataProvider = "VariantContextTest_SingleContexts")
public void testBCF2WriterReaderMissingGenotypes(final VariantContextTestProvider.VariantContextTestData testData) throws IOException {
VariantContextTestProvider.testReaderWriterWithMissingGenotypes(new BCFIOTester(), testData);
}
private class BCFIOTester extends VariantContextTestProvider.VariantContextIOTest {
@Override
public String getExtension() {
@ -110,6 +115,11 @@ public class VariantContextWritersUnitTest extends BaseTest {
VariantContextTestProvider.testReaderWriter(new VCFIOTester(), testData);
}
@Test(enabled = true, dataProvider = "VariantContextTest_SingleContexts")
public void testVCF4WriterReaderMissingGenotypes(final VariantContextTestProvider.VariantContextTestData testData) throws IOException {
VariantContextTestProvider.testReaderWriterWithMissingGenotypes(new VCFIOTester(), testData);
}
private class VCFIOTester extends VariantContextTestProvider.VariantContextIOTest {
@Override
public String getExtension() {