Continuing bugfixes to get new VC working

This commit is contained in:
Mark DePristo 2011-11-16 10:39:17 -05:00
parent df415da4ab
commit e56d52006a
5 changed files with 77 additions and 73 deletions

View File

@ -333,11 +333,11 @@ public class BeagleOutputToVCFWalker extends RodWalker<Integer, Integer> {
VariantContext filteredVC;
if ( beagleVarCounts > 0 || DONT_FILTER_MONOMORPHIC_SITES )
filteredVC = new VariantContext("outputvcf", VCFConstants.EMPTY_ID_FIELD, vc_input.getChr(), vc_input.getStart(), vc_input.getEnd(), vc_input.getAlleles(), genotypes, vc_input.getNegLog10PError(), vc_input.filtersWereApplied() ? vc_input.getFilters() : null, vc_input.getAttributes());
filteredVC = new VariantContext("outputvcf", vc_input.getID(), vc_input.getChr(), vc_input.getStart(), vc_input.getEnd(), vc_input.getAlleles(), genotypes, vc_input.getNegLog10PError(), vc_input.filtersWereApplied() ? vc_input.getFilters() : null, vc_input.getAttributes());
else {
Set<String> removedFilters = vc_input.filtersWereApplied() ? new HashSet<String>(vc_input.getFilters()) : new HashSet<String>(1);
removedFilters.add(String.format("BGL_RM_WAS_%s",vc_input.getAlternateAllele(0)));
filteredVC = new VariantContext("outputvcf", VCFConstants.EMPTY_ID_FIELD, vc_input.getChr(), vc_input.getStart(), vc_input.getEnd(), new HashSet<Allele>(Arrays.asList(vc_input.getReference())), genotypes, vc_input.getNegLog10PError(), removedFilters, vc_input.getAttributes());
filteredVC = new VariantContext("outputvcf", vc_input.getID(), vc_input.getChr(), vc_input.getStart(), vc_input.getEnd(), new HashSet<Allele>(Arrays.asList(vc_input.getReference())), genotypes, vc_input.getNegLog10PError(), removedFilters, vc_input.getAttributes());
}
HashMap<String, Object> attributes = new HashMap<String, Object>(filteredVC.getAttributes());

View File

@ -654,7 +654,10 @@ public class SelectVariants extends RodWalker<Integer, Integer> {
if ( samples == null || samples.isEmpty() )
return vc;
// logger.info("Genotypes in full vc: " + vc.getGenotypes());
// logger.info("My own sub : " + vc.getGenotypes().subsetToSamples(samples));
VariantContext sub = vc.subContextFromSamples(samples, vc.getAlleles());
// logger.info("Genotypes in sub vc: " + sub.getGenotypes());
// if we have fewer alternate alleles in the selected VC than in the original VC, we need to strip out the GL/PLs (because they are no longer accurate)
if ( vc.getAlleles().size() != sub.getAlleles().size() )
@ -691,6 +694,7 @@ public class SelectVariants extends RodWalker<Integer, Integer> {
sub = VariantContext.modifyAttributes(sub, attributes);
// logger.info("Genotypes in final vc: " + sub.getGenotypes());
return sub;
}

View File

@ -30,11 +30,13 @@ import java.util.*;
*
*/
public class GenotypeCollection implements List<Genotype> {
public final static GenotypeCollection NO_GENOTYPES = new GenotypeCollection();
public final static GenotypeCollection NO_GENOTYPES =
new GenotypeCollection(new ArrayList<Genotype>(0), new HashMap<String, Integer>(0), new HashSet<String>(0), true);
Set<String> sampleNamesInOrder = null;
Map<String, Integer> sampleNameToOffset = null;
boolean cacheIsInvalid = true;
final ArrayList<Genotype> genotypes;
List<Genotype> genotypes;
boolean immutable = false;
// ---------------------------------------------------------------------------
@ -54,6 +56,19 @@ public class GenotypeCollection implements List<Genotype> {
private GenotypeCollection(final ArrayList<Genotype> genotypes, final boolean immutable) {
this.genotypes = genotypes;
this.immutable = immutable;
this.sampleNameToOffset = null;
this.cacheIsInvalid = true;
}
private GenotypeCollection(final ArrayList<Genotype> genotypes,
final Map<String, Integer> sampleNameToOffset,
final Set<String> sampleNamesInOrder,
final boolean immutable) {
this.genotypes = genotypes;
this.immutable = immutable;
this.sampleNameToOffset = sampleNameToOffset;
this.sampleNamesInOrder = sampleNamesInOrder;
this.cacheIsInvalid = false;
}
// ---------------------------------------------------------------------------
@ -108,12 +123,8 @@ public class GenotypeCollection implements List<Genotype> {
//
// ---------------------------------------------------------------------------
public final GenotypeCollection mutable() {
immutable = false;
return this;
}
public final GenotypeCollection immutable() {
this.genotypes = Collections.unmodifiableList(genotypes);
immutable = true;
return this;
}
@ -135,17 +146,20 @@ public class GenotypeCollection implements List<Genotype> {
private void invalidateCaches() {
cacheIsInvalid = true;
if ( sampleNameToOffset != null ) sampleNameToOffset.clear();
sampleNamesInOrder = null;
sampleNameToOffset = null;
}
private void buildCache() {
cacheIsInvalid = false;
sampleNamesInOrder = new TreeSet<String>();
sampleNameToOffset = new HashMap<String, Integer>(genotypes.size());
if ( sampleNameToOffset == null )
sampleNameToOffset = new HashMap<String, Integer>(genotypes.size());
for ( int i = 0; i < genotypes.size(); i++ )
sampleNameToOffset.put(genotypes.get(i).getSampleName(), i);
for ( int i = 0; i < genotypes.size(); i++ ) {
final Genotype g = genotypes.get(i);
sampleNamesInOrder.add(g.getSampleName());
sampleNameToOffset.put(g.getSampleName(), i);
}
}
@ -341,7 +355,8 @@ public class GenotypeCollection implements List<Genotype> {
}
public Set<String> getSampleNamesOrderedByName() {
return new TreeSet<String>(getSampleNames());
buildCache();
return sampleNamesInOrder;
}
public boolean containsSample(final String sample) {
@ -365,10 +380,41 @@ public class GenotypeCollection implements List<Genotype> {
return NO_GENOTYPES;
else {
GenotypeCollection subset = create(samples.size());
for ( final Genotype g : genotypes )
if ( samples.contains(g.getSampleName()) )
for ( final Genotype g : genotypes ) {
if ( samples.contains(g.getSampleName()) ) {
subset.add(g);
}
}
return subset;
}
}
@Override
public String toString() {
final List<String> gS = new ArrayList<String>();
for ( final Genotype g : this.iterateInSampleNameOrder() )
gS.add(g.toString());
return "[" + join(",", gS) + "]";
}
// copied from Utils
private static <T> String join(final String separator, final Collection<T> objects) {
if (objects.isEmpty()) { // fast path for empty collection
return "";
} else {
final Iterator<T> iter = objects.iterator();
final T first = iter.next();
if ( ! iter.hasNext() ) // fast path for singleton collections
return first.toString();
else { // full path for 2+ collection that actually need a join
final StringBuilder ret = new StringBuilder(first.toString());
while(iter.hasNext()) {
ret.append(separator);
ret.append(iter.next().toString());
}
return ret.toString();
}
}
}
}

View File

@ -411,7 +411,7 @@ public class VariantContext implements Feature { // to enable tribble intergrati
// we need to make this a LinkedHashSet in case the user prefers a given ordering of alleles
this.alleles = makeAlleles(alleles);
if ( genotypes == null ) {
if ( genotypes == null || genotypes == NO_GENOTYPES ) {
this.genotypes = NO_GENOTYPES;
} else {
this.genotypes = genotypes.immutable();
@ -543,8 +543,10 @@ public class VariantContext implements Feature { // to enable tribble intergrati
// }
public VariantContext subContextFromSamples(Set<String> sampleNames, Collection<Allele> alleles) {
loadGenotypes();
GenotypeCollection newGenotypes = genotypes.subsetToSamples(sampleNames);
return new VariantContext(getSource(), getID(), contig, start, stop, alleles,
genotypes.subsetToSamples(sampleNames),
newGenotypes,
getNegLog10PError(),
filtersWereApplied() ? getFilters() : null,
getAttributes(),
@ -552,6 +554,7 @@ public class VariantContext implements Feature { // to enable tribble intergrati
}
public VariantContext subContextFromSamples(Set<String> sampleNames) {
loadGenotypes();
GenotypeCollection newGenotypes = genotypes.subsetToSamples(sampleNames);
return new VariantContext(getSource(), getID(), contig, start, stop, allelesOfGenotypes(newGenotypes),
newGenotypes,
@ -562,7 +565,7 @@ public class VariantContext implements Feature { // to enable tribble intergrati
}
public VariantContext subContextFromSample(String sampleName) {
return subContextFromSamples(new HashSet<String>(Arrays.asList(sampleName)));
return subContextFromSamples(Collections.singleton(sampleName));
}
/**
@ -1460,7 +1463,9 @@ public class VariantContext implements Feature { // to enable tribble intergrati
public String toString() {
return String.format("[VC %s @ %s of type=%s alleles=%s attr=%s GT=%s",
getSource(), contig + ":" + (start - stop == 0 ? start : start + "-" + stop), this.getType(),
ParsingUtils.sortList(this.getAlleles()), ParsingUtils.sortedString(this.getAttributes()), this.getGenotypesSortedByName());
ParsingUtils.sortList(this.getAlleles()),
ParsingUtils.sortedString(this.getAttributes()),
this.getGenotypes());
}
// protected basic manipulation routines

View File

@ -1,51 +0,0 @@
package org.broadinstitute.sting.gatk.walkers.phasing;
import org.broadinstitute.sting.WalkerTest;
import org.testng.annotations.Test;
import java.util.Arrays;
public class MergeSegregatingAlternateAllelesIntegrationTest extends WalkerTest {
public static String baseTestString(String reference, String VCF, int maxDist) {
return "-T MergeSegregatingAlternateAlleles" +
" -R " + reference +
" --variant:vcf " + validationDataLocation + VCF +
" --maxGenomicDistance " + maxDist +
" -o %s" +
" -NO_HEADER";
}
@Test
public void test1() {
WalkerTestSpec spec = new WalkerTestSpec(
baseTestString(hg18Reference, "merging_test_chr20_556259_756570.vcf", 1)
+ " -L chr20:556259-756570",
1,
Arrays.asList("af5e1370822551c0c6f50f23447dc627"));
executeTest("Merge sites within genomic distance of 1 [TEST ONE]", spec);
}
@Test
public void test2() {
WalkerTestSpec spec = new WalkerTestSpec(
baseTestString(hg18Reference, "merging_test_chr20_556259_756570.vcf", 10)
+ " -L chr20:556259-756570",
1,
Arrays.asList("dd8c44ae1ef059a7fe85399467e102eb"));
executeTest("Merge sites within genomic distance of 10 [TEST TWO]", spec);
}
@Test
public void test3() {
WalkerTestSpec spec = new WalkerTestSpec(
baseTestString(hg18Reference, "merging_test_chr20_556259_756570.vcf", 100)
+ " -L chr20:556259-756570",
1,
Arrays.asList("f81fd72ecaa57b3215406fcea860bcc5"));
executeTest("Merge sites within genomic distance of 100 [TEST THREE]", spec);
}
}