adding the original allele list to a variant context (as the annotation ORIGINAL_ALLELE_LIST), in the case where the set alleles are the result of clipping. Added tests for both cases.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3658 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
aaron 2010-06-28 17:23:46 +00:00
parent 1292c96e29
commit 62d22ff1aa
2 changed files with 39 additions and 8 deletions

View File

@ -25,6 +25,9 @@ import java.util.*;
*/
public class VCF4Codec implements FeatureCodec {
// a variant context flag for original allele strings
public static final String ORIGINAL_ALLELE_LIST = "ORIGINAL_ALLELE_LIST";
// we have to store the list of strings that make up the header until they're needed
private VCFHeader header = null;
@ -364,6 +367,7 @@ public class VCF4Codec implements FeatureCodec {
*/
private VariantContext parseVCFLine(String[] parts, boolean parseGenotypes) {
try {
// increment the line count
lineNo++;
// parse out the required fields
@ -376,15 +380,19 @@ public class VCF4Codec implements FeatureCodec {
String filter = parts[6];
String info = parts[7];
// get our alleles, filters, and setup an attribute map
List<Allele> alleles = parseAlleles(ref, alts);
Set<String> filters = parseFilters(filter);
Map<String, Object> attributes = parseInfo(info, id);
// find out our current location, and clip the alleles down to their minimum length
Pair<GenomeLoc, List<Allele>> locAndAlleles = (ref.length() > 1) ?
clipAlleles(contig, pos, ref, alleles) :
new Pair<GenomeLoc, List<Allele>>(GenomeLocParser.createGenomeLoc(contig, pos), alleles);
Pair<GenomeLoc, List<Allele>> locAndAlleles;
if (ref.length() > 1) {
attributes.put(ORIGINAL_ALLELE_LIST,alleles);
locAndAlleles = clipAlleles(contig, pos, ref, alleles);
} else {
locAndAlleles = new Pair<GenomeLoc, List<Allele>>(GenomeLocParser.createGenomeLoc(contig, pos), alleles);
}
// a map to store our genotypes
Map<String, Genotype> genotypes = null;
@ -477,7 +485,11 @@ public class VCF4Codec implements FeatureCodec {
/**
* clip the alleles, based on the reference
* @param unclippedAlleles the list of alleles
*
* @param contig our contig position
* @param position the unadjusted start position (pre-clipping)
* @param ref the reference string
* @param unclippedAlleles the list of unclipped alleles
* @return a list of alleles, clipped to the reference
*/
static Pair<GenomeLoc,List<Allele>> clipAlleles(String contig, long position, String ref, List<Allele> unclippedAlleles) {
@ -502,7 +514,6 @@ public class VCF4Codec implements FeatureCodec {
if (clipping) reverseClipped++;
}
for (Allele a : unclippedAlleles)
newAlleleList.add(Allele.create(Arrays.copyOfRange(a.getBases(),forwardClipping,a.getBases().length-reverseClipped),a.isReference()));

View File

@ -41,7 +41,7 @@ public class VCF4UnitTest extends BaseTest {
} catch (FileNotFoundException e) {
throw new StingException("unable to load the sequence dictionary");
}
GenomeLocParser.setupRefContigOrdering(seq);
GenomeLocParser.setupRefContigOrdering(seq.getSequenceDictionary());
}
@Test
@ -197,13 +197,33 @@ public class VCF4UnitTest extends BaseTest {
}
// test too few info lines, we don't provide the DP in this line
String twoFewInfoLine = "20\t14370\trs6054257\tG\tA\t29\tPASS\tNS=3;AF=0.5;DB;H2\tGT:GQ:DP:HQ\t0|0:48:1:51,51\t0|0:48:1:51,51\t0|0:48:1:51,51";
String twoFewInfoLine = "20\t14370\trs6054257\tG\tA\t29\tPASS\tNS=3;AF=0.5;DB\tGT:GQ:DP:HQ\t0|0:48:1:51,51\t0|0:48:1:51,51\t0|0:48:1:51,51";
@Test
public void testCheckTwoFewInfoValidation() {
TestSetup testSetup = new TestSetup().invoke(vcfGenotypeFile);
testSetup.codec.decode(twoFewInfoLine);
}
// test that the variant context adds attributes for the original alleles when clipped
String clippedAlleleLine = "20\t14370\trs6054257\tGGG\tG\t29\tPASS\tNS=3;AF=0.5;DB;H2\tGT:GQ:DP:HQ\t0|1:48:1:51,51\t0|0:48:1:51,51\t0|0:48:1:51,51";
@Test
public void testClippedAllelesAddedAsAnnotation() {
TestSetup testSetup = new TestSetup().invoke(vcfGenotypeFile);
VariantContext context = (VariantContext)testSetup.codec.decode(clippedAlleleLine);
Assert.assertTrue(context.hasAttribute(VCF4Codec.ORIGINAL_ALLELE_LIST));
List<Allele> alleles = (List<Allele>)context.getAttribute(VCF4Codec.ORIGINAL_ALLELE_LIST);
Assert.assertEquals("Expected allele list of 2, got " + alleles.size(),2,alleles.size());
Assert.assertTrue(alleles.get(0).basesMatch("GGG"));
Assert.assertTrue(alleles.get(1).basesMatch("G"));
}
// test that when we don't clip the alleles, we don't see the annotation
@Test
public void testNoClippedAllelesNoAddedAsAnnotation() {
TestSetup testSetup = new TestSetup().invoke(vcfGenotypeFile);
VariantContext context = (VariantContext)testSetup.codec.decode(twoFewInfoLine);
Assert.assertTrue(!context.hasAttribute(VCF4Codec.ORIGINAL_ALLELE_LIST));
}
// test that we're getting the right genotype for a multi-base polymorphism
String MNPLine = "20\t14370\trs6054257\tGG\tAT\t29\tPASS\tNS=3;DP=14;AF=0.5;DB;H2\tGT:GQ:DP:HQ\t0|0:48:1:51,51\t1|0:48:8:51,51\t1/1:43:5:.,.";
@Test