Working implementation of decodeLoc for VCF parsing. Makes indexing 3x faster.
This commit is contained in:
parent
518b3dd291
commit
2c24b68a96
|
|
@ -154,9 +154,45 @@ public abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec,
|
||||||
* @return a feature (not guaranteed complete) that has the correct start and stop
|
* @return a feature (not guaranteed complete) that has the correct start and stop
|
||||||
*/
|
*/
|
||||||
public Feature decodeLoc(String line) {
|
public Feature decodeLoc(String line) {
|
||||||
return reallyDecode(line);
|
String[] locParts = new String[6];
|
||||||
|
ParsingUtils.split(line, locParts, VCFConstants.FIELD_SEPARATOR_CHAR, true);
|
||||||
|
|
||||||
|
// get our alleles (because the end position depends on them)
|
||||||
|
String ref = getCachedString(locParts[3].toUpperCase());
|
||||||
|
String alts = getCachedString(locParts[4].toUpperCase());
|
||||||
|
List<Allele> alleles = parseAlleles(ref, alts, lineNo);
|
||||||
|
|
||||||
|
// find out our location
|
||||||
|
int start = Integer.valueOf(locParts[1]);
|
||||||
|
int stop = start;
|
||||||
|
|
||||||
|
// ref alleles don't need to be single bases for monomorphic sites
|
||||||
|
if ( alleles.size() == 1 ) {
|
||||||
|
stop = start + alleles.get(0).length() - 1;
|
||||||
|
} else if ( !isSingleNucleotideEvent(alleles) ) {
|
||||||
|
stop = clipAlleles(start, ref, alleles, null, lineNo);
|
||||||
|
}
|
||||||
|
|
||||||
|
return new VCFLocFeature(locParts[0], start, stop);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private final static class VCFLocFeature implements Feature {
|
||||||
|
|
||||||
|
final String chr;
|
||||||
|
final int start, stop;
|
||||||
|
|
||||||
|
private VCFLocFeature(String chr, int start, int stop) {
|
||||||
|
this.chr = chr;
|
||||||
|
this.start = start;
|
||||||
|
this.stop = stop;
|
||||||
|
}
|
||||||
|
|
||||||
|
public String getChr() { return chr; }
|
||||||
|
public int getStart() { return start; }
|
||||||
|
public int getEnd() { return stop; }
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* decode the line into a feature (VariantContext)
|
* decode the line into a feature (VariantContext)
|
||||||
* @param line the line
|
* @param line the line
|
||||||
|
|
@ -207,7 +243,7 @@ public abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec,
|
||||||
|
|
||||||
// parse out the required fields
|
// parse out the required fields
|
||||||
String contig = getCachedString(parts[0]);
|
String contig = getCachedString(parts[0]);
|
||||||
long pos = Long.valueOf(parts[1]);
|
int pos = Integer.valueOf(parts[1]);
|
||||||
String id = null;
|
String id = null;
|
||||||
if ( parts[2].length() == 0 )
|
if ( parts[2].length() == 0 )
|
||||||
generateException("The VCF specification requires a valid ID field");
|
generateException("The VCF specification requires a valid ID field");
|
||||||
|
|
@ -227,7 +263,7 @@ public abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec,
|
||||||
Map<String, Object> attributes = parseInfo(info, id);
|
Map<String, Object> attributes = parseInfo(info, id);
|
||||||
|
|
||||||
// find out our current location, and clip the alleles down to their minimum length
|
// find out our current location, and clip the alleles down to their minimum length
|
||||||
long loc = pos;
|
int loc = pos;
|
||||||
// ref alleles don't need to be single bases for monomorphic sites
|
// ref alleles don't need to be single bases for monomorphic sites
|
||||||
if ( alleles.size() == 1 ) {
|
if ( alleles.size() == 1 ) {
|
||||||
loc = pos + alleles.get(0).length() - 1;
|
loc = pos + alleles.get(0).length() - 1;
|
||||||
|
|
@ -506,9 +542,9 @@ public abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec,
|
||||||
* @param ref the reference string
|
* @param ref the reference string
|
||||||
* @param unclippedAlleles the list of unclipped alleles
|
* @param unclippedAlleles the list of unclipped alleles
|
||||||
* @param clippedAlleles output list of clipped alleles
|
* @param clippedAlleles output list of clipped alleles
|
||||||
* @return a list of alleles, clipped to the reference
|
* @return the new reference end position of this event
|
||||||
*/
|
*/
|
||||||
protected static long clipAlleles(long position, String ref, List<Allele> unclippedAlleles, List<Allele> clippedAlleles, int lineNo) {
|
protected static int clipAlleles(int position, String ref, List<Allele> unclippedAlleles, List<Allele> clippedAlleles, int lineNo) {
|
||||||
|
|
||||||
// Note that the computation of forward clipping here is meant only to see whether there is a common
|
// Note that the computation of forward clipping here is meant only to see whether there is a common
|
||||||
// base to all alleles, and to correctly compute reverse clipping,
|
// base to all alleles, and to correctly compute reverse clipping,
|
||||||
|
|
@ -534,11 +570,13 @@ public abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec,
|
||||||
if (clipping) reverseClipped++;
|
if (clipping) reverseClipped++;
|
||||||
}
|
}
|
||||||
|
|
||||||
for (Allele a : unclippedAlleles) {
|
if ( clippedAlleles != null ) {
|
||||||
if (a.isSymbolic()) {
|
for ( Allele a : unclippedAlleles ) {
|
||||||
clippedAlleles.add(a);
|
if ( a.isSymbolic() ) {
|
||||||
} else {
|
clippedAlleles.add(a);
|
||||||
clippedAlleles.add(Allele.create(Arrays.copyOfRange(a.getBases(),0,a.getBases().length-reverseClipped),a.isReference()));
|
} else {
|
||||||
|
clippedAlleles.add(Allele.create(Arrays.copyOfRange(a.getBases(),0,a.getBases().length-reverseClipped),a.isReference()));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue