clip to the null allele on the reference string in VCF 4, instead of stopping to perserve one reference base.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3613 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
b5df2705c9
commit
9872b65803
|
|
@ -409,6 +409,13 @@ public class VCF4Codec implements FeatureCodec {
|
|||
static Pair<GenomeLoc,List<Allele>> clipAlleles(String contig, long position, String ref, List<Allele> unclippedAlleles) {
|
||||
List<Allele> newAlleleList = new ArrayList<Allele>();
|
||||
|
||||
// find the smallest allele
|
||||
int minimumAllele = (unclippedAlleles.size() > 0) ? unclippedAlleles.get(0).length() : Integer.MAX_VALUE;
|
||||
for (Allele smallest : unclippedAlleles)
|
||||
if (smallest.length() < minimumAllele)
|
||||
minimumAllele = smallest.length();
|
||||
|
||||
|
||||
// find the preceeding string common to all alleles and the reference
|
||||
int forwardClipped = 0;
|
||||
boolean clipping = true;
|
||||
|
|
@ -434,8 +441,8 @@ public class VCF4Codec implements FeatureCodec {
|
|||
}
|
||||
|
||||
// check to see if we're about to clip all the bases from the reference, if so back off the front clip a base
|
||||
if (forwardClipped + reverseClipped >= ref.length())
|
||||
forwardClipped--;
|
||||
//if (forwardClipped + reverseClipped >= ref.length() || forwardClipped + reverseClipped >= minimumAllele)
|
||||
// forwardClipped--;
|
||||
|
||||
for (Allele a : unclippedAlleles)
|
||||
newAlleleList.add(Allele.create(Arrays.copyOfRange(a.getBases(),forwardClipped,a.getBases().length-reverseClipped),a.isReference()));
|
||||
|
|
|
|||
|
|
@ -193,9 +193,9 @@ public class VCF4UnitTest extends BaseTest {
|
|||
testSetup.codec.decode(twoManyInfoLine);
|
||||
}
|
||||
|
||||
/* File largeVCF = new File("/humgen/gsa-hpprojects/dev/depristo/oneOffProjects/1000GenomesTable1/dindel-v2/CEU.low_coverage.2010_06.indel.genotypes.vcf");
|
||||
File largeVCF = new File("/humgen/gsa-hpprojects/dev/depristo/oneOffProjects/1000GenomesTable1/dindel-v2/CEU.low_coverage.2010_06.indel.genotypes.vcf");
|
||||
|
||||
@Test
|
||||
//@Test
|
||||
public void checkLargeVCF() {
|
||||
TestSetup testSetup = new TestSetup().invoke(largeVCF);
|
||||
AsciiLineReader reader = testSetup.getReader();
|
||||
|
|
@ -212,6 +212,7 @@ public class VCF4UnitTest extends BaseTest {
|
|||
// our record count
|
||||
int recordCount = 0;
|
||||
int badRecordCount = 0;
|
||||
long milliseconds = System.currentTimeMillis();
|
||||
while (line != null) {
|
||||
try {
|
||||
recordCount++;
|
||||
|
|
@ -229,10 +230,51 @@ public class VCF4UnitTest extends BaseTest {
|
|||
Assert.fail("Failed to read a line");
|
||||
}
|
||||
}
|
||||
System.err.println("Total time = " + (System.currentTimeMillis() - milliseconds));
|
||||
Assert.assertEquals(0,badRecordCount);
|
||||
Assert.assertEquals(728075,recordCount);
|
||||
}
|
||||
*/
|
||||
|
||||
//@Test
|
||||
public void checkBobsCNVVCF() {
|
||||
TestSetup testSetup = new TestSetup().invoke(new File("bobs.vcf"));
|
||||
AsciiLineReader reader = testSetup.getReader();
|
||||
VCF4Codec codec = testSetup.getCodec();
|
||||
|
||||
// now parse the lines
|
||||
String line = null;
|
||||
try {
|
||||
line = reader.readLine();
|
||||
} catch (IOException e) {
|
||||
Assert.fail("Failed to read a line");
|
||||
}
|
||||
|
||||
// our record count
|
||||
int recordCount = 0;
|
||||
int badRecordCount = 0;
|
||||
long milliseconds = System.currentTimeMillis();
|
||||
while (line != null) {
|
||||
try {
|
||||
recordCount++;
|
||||
try {
|
||||
testSetup.codec.decode(line);
|
||||
} catch (Exception e) {
|
||||
System.err.println(e.getMessage() + " -> " + line);
|
||||
System.err.println(line);
|
||||
badRecordCount++;
|
||||
}
|
||||
line = reader.readLine();
|
||||
if (recordCount % 1000 == 0)
|
||||
System.err.println("record count == " + recordCount);
|
||||
} catch (IOException e) {
|
||||
Assert.fail("Failed to read a line");
|
||||
}
|
||||
}
|
||||
System.err.println("Total time = " + (System.currentTimeMillis() - milliseconds));
|
||||
Assert.assertEquals(0,badRecordCount);
|
||||
Assert.assertEquals(15947,recordCount);
|
||||
}
|
||||
|
||||
/**
|
||||
* test out the clipping of alleles (removing extra context provided by VCF implementation).
|
||||
*/
|
||||
|
|
@ -278,6 +320,27 @@ public class VCF4UnitTest extends BaseTest {
|
|||
Assert.assertTrue(locAndList.second.get(2).toString().equals("G"));
|
||||
}
|
||||
|
||||
/**
|
||||
* test out the clipping of alleles (removing extra context provided by VCF implementation).
|
||||
*/
|
||||
@Test
|
||||
public void testClippingOfAllelesLongRefRepeatClippable() {
|
||||
String ref = "GGGGG";
|
||||
ArrayList<Allele> alleles = new ArrayList<Allele>();
|
||||
alleles.add(Allele.create(ref,true));
|
||||
alleles.add(Allele.create("GG",false));
|
||||
alleles.add(Allele.create("GGG",false));
|
||||
|
||||
Pair<GenomeLoc, List<Allele>> locAndList = VCF4Codec.clipAlleles("1",1,ref,alleles);
|
||||
Assert.assertTrue(locAndList.first.equals(GenomeLocParser.createGenomeLoc("1",3,5)));
|
||||
|
||||
// we know the ordering
|
||||
Assert.assertTrue(locAndList.second.get(0).toString().equals("GGG*"));
|
||||
Assert.assertTrue(locAndList.second.get(0).isReference());
|
||||
Assert.assertTrue(locAndList.second.get(1).toString().equals("-"));
|
||||
Assert.assertTrue(locAndList.second.get(2).toString().equals("G"));
|
||||
}
|
||||
|
||||
/**
|
||||
* test out the clipping of alleles (removing extra context provided by VCF implementation).
|
||||
*/
|
||||
|
|
|
|||
Loading…
Reference in New Issue