a) Fixed md5 for legit change in UG output that now also no-calls genotypes w/0,0,0 in PL's in SNP case.

b) First reimplementation of new vc merger of different types. Previous version did it in two steps, first merging all vc's per type and then trying to see if resulting vc's would be merged if alleles of one type were a subset of another, but this won't work when uniquifying genotypes since sample names would be messed up and GT sample names wouldn't match VC sample names. Now, it's actually simpler: when splitting vc's by type before merging, we check for alleles of one vc being a subset of alleles of vc of another type and if so we put them together in same list.
This commit is contained in:
Guillermo del Angel 2011-09-24 13:40:11 -04:00
parent 3a4469a236
commit cd058dd10f
3 changed files with 43 additions and 37 deletions

View File

@ -234,47 +234,17 @@ public class CombineVariants extends RodWalker<Integer, Integer> {
if (minimumN > 1 && (vcs.size() - numFilteredRecords < minimumN))
return 0;
List<VariantContext> preMergedVCs = new ArrayList<VariantContext>();
List<VariantContext> mergedVCs = new ArrayList<VariantContext>();
Map<VariantContext.Type, List<VariantContext>> VCsByType = VariantContextUtils.separateVariantContextsByType(vcs);
// iterate over the types so that it's deterministic
for ( VariantContext.Type type : VariantContext.Type.values() ) {
if ( VCsByType.containsKey(type) )
preMergedVCs.add(VariantContextUtils.simpleMerge(getToolkit().getGenomeLocParser(), VCsByType.get(type),
mergedVCs.add(VariantContextUtils.simpleMerge(getToolkit().getGenomeLocParser(), VCsByType.get(type),
priority, filteredRecordsMergeType, genotypeMergeOption, true, printComplexMerges,
SET_KEY, filteredAreUncalled, MERGE_INFO_WITH_MAX_AC));
}
List<VariantContext> mergedVCs = new ArrayList<VariantContext>();
// se have records merged but separated by type. If a particular record is for example a snp but all alleles are a subset of an existing mixed record,
// we will still merge those records.
if (preMergedVCs.size() > 1) {
for (VariantContext vc1 : preMergedVCs) {
VariantContext newvc = vc1;
boolean merged = false;
for (int k=0; k < mergedVCs.size(); k++) {
VariantContext vc2 = mergedVCs.get(k);
if (VariantContextUtils.allelesAreSubset(vc1,vc2) || VariantContextUtils.allelesAreSubset(vc2,vc1)) {
// all alleles of vc1 are contained in vc2 but they are of different type (say, vc1 is snp, vc2 is complex): try to merget v1 into v2
List<VariantContext> vcpair = new ArrayList<VariantContext>();
vcpair.add(vc1);
vcpair.add(vc2);
newvc = VariantContextUtils.simpleMerge(getToolkit().getGenomeLocParser(), vcpair,
priority, filteredRecordsMergeType, genotypeMergeOption, true, printComplexMerges,
SET_KEY, filteredAreUncalled, MERGE_INFO_WITH_MAX_AC);
mergedVCs.set(k,newvc);
merged = true;
break;
}
}
if (!merged)
mergedVCs.add(vc1);
}
}
else {
mergedVCs = preMergedVCs;
}
for ( VariantContext mergedVC : mergedVCs ) {
for ( VariantContext mergedVC : mergedVCs ) {
// only operate at the start of events
if ( mergedVC == null )
continue;

View File

@ -758,9 +758,45 @@ public class VariantContextUtils {
public static Map<VariantContext.Type, List<VariantContext>> separateVariantContextsByType(Collection<VariantContext> VCs) {
HashMap<VariantContext.Type, List<VariantContext>> mappedVCs = new HashMap<VariantContext.Type, List<VariantContext>>();
for ( VariantContext vc : VCs ) {
if ( !mappedVCs.containsKey(vc.getType()) )
mappedVCs.put(vc.getType(), new ArrayList<VariantContext>());
mappedVCs.get(vc.getType()).add(vc);
// look at previous variant contexts of different type. If:
// a) otherVC has alleles which are subset of vc, remove otherVC from its list and add otherVC to vc's list
// b) vc has alleles which are subset of otherVC. Then, add vc to otherVC's type list (rather, do nothing since vc will be added automatically to its list)
// c) neither: do nothing, just add vc to its own list
boolean addtoOwnList = true;
for (VariantContext.Type type : VariantContext.Type.values()) {
if (type.equals(vc.getType()))
continue;
if (!mappedVCs.containsKey(type))
continue;
List<VariantContext> vcList = mappedVCs.get(type);
for (int k=0; k < vcList.size(); k++) {
VariantContext otherVC = vcList.get(k);
if (allelesAreSubset(otherVC,vc)) {
// otherVC has a type different than vc and its alleles are a subset of vc: remove otherVC from its list and add it to vc's type list
vcList.remove(k);
// avoid having empty lists
if (vcList.size() == 0)
mappedVCs.remove(vcList);
if ( !mappedVCs.containsKey(vc.getType()) )
mappedVCs.put(vc.getType(), new ArrayList<VariantContext>());
mappedVCs.get(vc.getType()).add(otherVC);
break;
}
else if (allelesAreSubset(vc,otherVC)) {
// vc has a type different than otherVC and its alleles are a subset of VC: add vc to otherVC's type list and don't add to its own
mappedVCs.get(type).add(vc);
addtoOwnList = false;
}
}
}
if (addtoOwnList) {
if ( !mappedVCs.containsKey(vc.getType()) )
mappedVCs.put(vc.getType(), new ArrayList<VariantContext>());
mappedVCs.get(vc.getType()).add(vc);
}
}
return mappedVCs;

View File

@ -42,7 +42,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec(
baseCommand + " --output_mode EMIT_ALL_SITES --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "allelesForUG.vcf -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,025,000", 1,
Arrays.asList("ec43daadfb15b00b41aeb0017a45df0b"));
Arrays.asList("6458f3b8fe4954e2ffc2af972aaab19e"));
executeTest("test MultiSample Pilot2 with alleles passed in and emitting all sites", spec2);
}