From 3016e1cf8071b5205ebaf2d1475254ce96184446 Mon Sep 17 00:00:00 2001 From: delangel Date: Sun, 4 Jul 2010 20:22:04 +0000 Subject: [PATCH] Fixes to increase robustness in vcf4 writer. We assume that only at most 1 base was clipped from beginning of allele encoding by reader, and improve the way we find if bases were clipped. We still cant deal with some corner cases, and duplicate records may follow, for example if a snp location is followed at the next base by an indel. Also, if we are reading form a 3.3 vcf and the reference is null (ie we have an insertion), the reference base is not computed correctly. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3717 348d0f76-0448-11de-a6fe-93d51630548a --- .../sting/utils/genotype/vcf/VCFWriter.java | 100 +++++++++--------- 1 file changed, 51 insertions(+), 49 deletions(-) diff --git a/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFWriter.java b/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFWriter.java index 65ff88cab..4f7da63b7 100644 --- a/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFWriter.java +++ b/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFWriter.java @@ -243,70 +243,72 @@ public class VCFWriter { // TODO- clean up these flags and associated code boolean filtersWereAppliedToContext = true; List allowedGenotypeAttributeKeys = null; - boolean filtersWereAppliedToGenotypes = true; + String filters = vc.isFiltered() ? Utils.join(";", Utils.sorted(vc.getFilters())) : (filtersWereAppliedToContext ? PASSES_FILTERS_STRING : UNFILTERED); Map alleleMap = new HashMap(); alleleMap.put(Allele.NO_CALL, new VCFGenotypeEncoding(VCFGenotypeRecord.EMPTY_ALLELE)); // convenience for lookup List vcfAltAlleles = new ArrayList(); - int refIndex=0, numTrailingBases = 0, numPaddingBases = 0; - String paddingBases = new String(""); - String trailingBases = new String(""); + int numTrailingBases = 0, numPaddingBases = 0; + String paddingBases = ""; + String trailingBases = ""; ArrayList originalAlleles = (ArrayList)vc.getAttribute("ORIGINAL_ALLELE_LIST"); + // search for reference allele and find trailing and padding at the end. + // See first if all alleles have a base encoding (ie no deletions) + // if so, add one common base to all alleles (reference at this location) + boolean hasBasesInAllAlleles = true; + String refString = null; + for ( Allele a : vc.getAlleles() ) { + if (a.isNull() || a.isNoCall()) + hasBasesInAllAlleles = false; + + if (a.isReference()) + refString = new String(a.getBases()); + } + if (originalAlleles != null) { + // if original allele info was filled when reading from a VCF4 file, + // determine whether there was a padding base(s) at the beginning and end. + byte previousBase = 0; + int cnt=0; + boolean firstBaseCommonInAllAlleles = true; Allele originalReferenceAllele = null; - for (refIndex=0; refIndex < originalAlleles.size(); refIndex++) { - originalReferenceAllele = originalAlleles.get(refIndex); - if (originalReferenceAllele.isReference()) - break; + for (Allele originalAllele : originalAlleles){ + // if first base of allele is common to all of them, there may have been a common base deleted from all + byte firstBase = originalAllele.getBases()[0]; + if (cnt > 0) { + if (firstBase != previousBase) + firstBaseCommonInAllAlleles = false; + } + previousBase = firstBase; + cnt++; + + if (originalAllele.isReference()) + originalReferenceAllele = originalAllele; } + + numTrailingBases = (firstBaseCommonInAllAlleles)? 1:0; + position -= numTrailingBases; + if (originalReferenceAllele == null) throw new IllegalStateException("At least one Allele must be reference"); - for ( Allele a : vc.getAlleles() ) { - if (a.isNonReference()) - continue; - - String refString = new String(a.getBases()); - String originalRef = new String(originalReferenceAllele.getBases()); - if (refString.equals("")) { - // special case with null reference: - // for example, null reference, originalRef = "AC", alt = "AGC". - // In this case, we'd have one trailing and one padding base, with alleles as {-,G}. - // TEMP fix: assume in this case one trailing base, more general solution would be to - // scan through all alleles and look for maximum common subsequence at beginning and at end for all alleles - numTrailingBases = 1; - } else - numTrailingBases = originalRef.lastIndexOf(refString); - - numPaddingBases = originalRef.length()-refString.length()-numTrailingBases; - if (numTrailingBases > 0) { - - position -= numTrailingBases; - trailingBases = originalRef.substring(0,numTrailingBases); - } - if (numPaddingBases > 0) - paddingBases = originalRef.substring(originalRef.length()-numPaddingBases,originalRef.length()); + String originalRef = new String(originalReferenceAllele.getBases()); + numPaddingBases = originalRef.length()-refString.length()-numTrailingBases; + if (numTrailingBases > 0) { + trailingBases = originalRef.substring(0,numTrailingBases); } - - } else { - // no original Allele information: see first if all alleles have a base encoding (ie no deletions) - // if so, add one common base to all alleles (reference at this location) - boolean hasBasesInAllAlleles = true; - for ( Allele a : vc.getAlleles() ) { - String alleleString = new String(a.getBases()); - if (alleleString.length()==0) { - hasBasesInAllAlleles = false; - break; - } - } - + if (numPaddingBases > 0) + paddingBases = originalRef.substring(originalRef.length()-numPaddingBases,originalRef.length()); + } + else { + // Case where there is no original allele info, e.g. when reading from VCF3.3 or when vc was produced by the GATK. if (!hasBasesInAllAlleles) { trailingBases = new String(refBases); numTrailingBases = 1; @@ -314,6 +316,7 @@ public class VCFWriter { } } + for ( Allele a : vc.getAlleles() ) { VCFGenotypeEncoding encoding; @@ -342,8 +345,7 @@ public class VCFWriter { if ( allowedGenotypeAttributeKeys == null || allowedGenotypeAttributeKeys.contains(key) ) vcfGenotypeAttributeKeys.add(key); } - if ( filtersWereAppliedToGenotypes ) // todo -- should be calculated - vcfGenotypeAttributeKeys.add(VCFGenotypeRecord.GENOTYPE_FILTER_KEY); + } String genotypeFormatString = Utils.join(GENOTYPE_FIELD_SEPARATOR, vcfGenotypeAttributeKeys); @@ -497,7 +499,7 @@ public class VCFWriter { * @param vcfAltAlleles alternate alleles at this site */ private void addGenotypeData(StringBuilder builder, VCFHeader header, - String genotypeFormatString, ListvcfAltAlleles) { + String genotypeFormatString, ListvcfAltAlleles) { Map gMap = genotypeListToMap(mGenotypeRecords); StringBuffer tempStr = new StringBuffer(); @@ -559,7 +561,7 @@ public class VCFWriter { } return isEmpty; - + } /** * create a genotype mapping from a list and their sample names