adding the original allele list to a variant context (as the annotation ORIGINAL_ALLELE_LIST), in the case where the set alleles are the result of clipping. Added tests for both cases.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3658 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
1292c96e29
commit
62d22ff1aa
|
|
@ -25,6 +25,9 @@ import java.util.*;
|
||||||
*/
|
*/
|
||||||
public class VCF4Codec implements FeatureCodec {
|
public class VCF4Codec implements FeatureCodec {
|
||||||
|
|
||||||
|
// a variant context flag for original allele strings
|
||||||
|
public static final String ORIGINAL_ALLELE_LIST = "ORIGINAL_ALLELE_LIST";
|
||||||
|
|
||||||
// we have to store the list of strings that make up the header until they're needed
|
// we have to store the list of strings that make up the header until they're needed
|
||||||
private VCFHeader header = null;
|
private VCFHeader header = null;
|
||||||
|
|
||||||
|
|
@ -364,6 +367,7 @@ public class VCF4Codec implements FeatureCodec {
|
||||||
*/
|
*/
|
||||||
private VariantContext parseVCFLine(String[] parts, boolean parseGenotypes) {
|
private VariantContext parseVCFLine(String[] parts, boolean parseGenotypes) {
|
||||||
try {
|
try {
|
||||||
|
// increment the line count
|
||||||
lineNo++;
|
lineNo++;
|
||||||
|
|
||||||
// parse out the required fields
|
// parse out the required fields
|
||||||
|
|
@ -376,15 +380,19 @@ public class VCF4Codec implements FeatureCodec {
|
||||||
String filter = parts[6];
|
String filter = parts[6];
|
||||||
String info = parts[7];
|
String info = parts[7];
|
||||||
|
|
||||||
|
// get our alleles, filters, and setup an attribute map
|
||||||
List<Allele> alleles = parseAlleles(ref, alts);
|
List<Allele> alleles = parseAlleles(ref, alts);
|
||||||
Set<String> filters = parseFilters(filter);
|
Set<String> filters = parseFilters(filter);
|
||||||
Map<String, Object> attributes = parseInfo(info, id);
|
Map<String, Object> attributes = parseInfo(info, id);
|
||||||
|
|
||||||
// find out our current location, and clip the alleles down to their minimum length
|
// find out our current location, and clip the alleles down to their minimum length
|
||||||
Pair<GenomeLoc, List<Allele>> locAndAlleles = (ref.length() > 1) ?
|
Pair<GenomeLoc, List<Allele>> locAndAlleles;
|
||||||
clipAlleles(contig, pos, ref, alleles) :
|
if (ref.length() > 1) {
|
||||||
new Pair<GenomeLoc, List<Allele>>(GenomeLocParser.createGenomeLoc(contig, pos), alleles);
|
attributes.put(ORIGINAL_ALLELE_LIST,alleles);
|
||||||
|
locAndAlleles = clipAlleles(contig, pos, ref, alleles);
|
||||||
|
} else {
|
||||||
|
locAndAlleles = new Pair<GenomeLoc, List<Allele>>(GenomeLocParser.createGenomeLoc(contig, pos), alleles);
|
||||||
|
}
|
||||||
|
|
||||||
// a map to store our genotypes
|
// a map to store our genotypes
|
||||||
Map<String, Genotype> genotypes = null;
|
Map<String, Genotype> genotypes = null;
|
||||||
|
|
@ -477,7 +485,11 @@ public class VCF4Codec implements FeatureCodec {
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* clip the alleles, based on the reference
|
* clip the alleles, based on the reference
|
||||||
* @param unclippedAlleles the list of alleles
|
*
|
||||||
|
* @param contig our contig position
|
||||||
|
* @param position the unadjusted start position (pre-clipping)
|
||||||
|
* @param ref the reference string
|
||||||
|
* @param unclippedAlleles the list of unclipped alleles
|
||||||
* @return a list of alleles, clipped to the reference
|
* @return a list of alleles, clipped to the reference
|
||||||
*/
|
*/
|
||||||
static Pair<GenomeLoc,List<Allele>> clipAlleles(String contig, long position, String ref, List<Allele> unclippedAlleles) {
|
static Pair<GenomeLoc,List<Allele>> clipAlleles(String contig, long position, String ref, List<Allele> unclippedAlleles) {
|
||||||
|
|
@ -502,7 +514,6 @@ public class VCF4Codec implements FeatureCodec {
|
||||||
if (clipping) reverseClipped++;
|
if (clipping) reverseClipped++;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
for (Allele a : unclippedAlleles)
|
for (Allele a : unclippedAlleles)
|
||||||
newAlleleList.add(Allele.create(Arrays.copyOfRange(a.getBases(),forwardClipping,a.getBases().length-reverseClipped),a.isReference()));
|
newAlleleList.add(Allele.create(Arrays.copyOfRange(a.getBases(),forwardClipping,a.getBases().length-reverseClipped),a.isReference()));
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -41,7 +41,7 @@ public class VCF4UnitTest extends BaseTest {
|
||||||
} catch (FileNotFoundException e) {
|
} catch (FileNotFoundException e) {
|
||||||
throw new StingException("unable to load the sequence dictionary");
|
throw new StingException("unable to load the sequence dictionary");
|
||||||
}
|
}
|
||||||
GenomeLocParser.setupRefContigOrdering(seq);
|
GenomeLocParser.setupRefContigOrdering(seq.getSequenceDictionary());
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
|
|
@ -197,13 +197,33 @@ public class VCF4UnitTest extends BaseTest {
|
||||||
}
|
}
|
||||||
|
|
||||||
// test too few info lines, we don't provide the DP in this line
|
// test too few info lines, we don't provide the DP in this line
|
||||||
String twoFewInfoLine = "20\t14370\trs6054257\tG\tA\t29\tPASS\tNS=3;AF=0.5;DB;H2\tGT:GQ:DP:HQ\t0|0:48:1:51,51\t0|0:48:1:51,51\t0|0:48:1:51,51";
|
String twoFewInfoLine = "20\t14370\trs6054257\tG\tA\t29\tPASS\tNS=3;AF=0.5;DB\tGT:GQ:DP:HQ\t0|0:48:1:51,51\t0|0:48:1:51,51\t0|0:48:1:51,51";
|
||||||
@Test
|
@Test
|
||||||
public void testCheckTwoFewInfoValidation() {
|
public void testCheckTwoFewInfoValidation() {
|
||||||
TestSetup testSetup = new TestSetup().invoke(vcfGenotypeFile);
|
TestSetup testSetup = new TestSetup().invoke(vcfGenotypeFile);
|
||||||
testSetup.codec.decode(twoFewInfoLine);
|
testSetup.codec.decode(twoFewInfoLine);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// test that the variant context adds attributes for the original alleles when clipped
|
||||||
|
String clippedAlleleLine = "20\t14370\trs6054257\tGGG\tG\t29\tPASS\tNS=3;AF=0.5;DB;H2\tGT:GQ:DP:HQ\t0|1:48:1:51,51\t0|0:48:1:51,51\t0|0:48:1:51,51";
|
||||||
|
@Test
|
||||||
|
public void testClippedAllelesAddedAsAnnotation() {
|
||||||
|
TestSetup testSetup = new TestSetup().invoke(vcfGenotypeFile);
|
||||||
|
VariantContext context = (VariantContext)testSetup.codec.decode(clippedAlleleLine);
|
||||||
|
Assert.assertTrue(context.hasAttribute(VCF4Codec.ORIGINAL_ALLELE_LIST));
|
||||||
|
List<Allele> alleles = (List<Allele>)context.getAttribute(VCF4Codec.ORIGINAL_ALLELE_LIST);
|
||||||
|
Assert.assertEquals("Expected allele list of 2, got " + alleles.size(),2,alleles.size());
|
||||||
|
Assert.assertTrue(alleles.get(0).basesMatch("GGG"));
|
||||||
|
Assert.assertTrue(alleles.get(1).basesMatch("G"));
|
||||||
|
}
|
||||||
|
// test that when we don't clip the alleles, we don't see the annotation
|
||||||
|
@Test
|
||||||
|
public void testNoClippedAllelesNoAddedAsAnnotation() {
|
||||||
|
TestSetup testSetup = new TestSetup().invoke(vcfGenotypeFile);
|
||||||
|
VariantContext context = (VariantContext)testSetup.codec.decode(twoFewInfoLine);
|
||||||
|
Assert.assertTrue(!context.hasAttribute(VCF4Codec.ORIGINAL_ALLELE_LIST));
|
||||||
|
}
|
||||||
|
|
||||||
// test that we're getting the right genotype for a multi-base polymorphism
|
// test that we're getting the right genotype for a multi-base polymorphism
|
||||||
String MNPLine = "20\t14370\trs6054257\tGG\tAT\t29\tPASS\tNS=3;DP=14;AF=0.5;DB;H2\tGT:GQ:DP:HQ\t0|0:48:1:51,51\t1|0:48:8:51,51\t1/1:43:5:.,.";
|
String MNPLine = "20\t14370\trs6054257\tGG\tAT\t29\tPASS\tNS=3;DP=14;AF=0.5;DB;H2\tGT:GQ:DP:HQ\t0|0:48:1:51,51\t1|0:48:8:51,51\t1/1:43:5:.,.";
|
||||||
@Test
|
@Test
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue