Committing the first version of the homopolymer filter. Removes SNPs that occur at the edges of homopolymer runs and whose nonref allele matches the repeated base in the homopolymer.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1373 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
mmelgar 2009-08-04 14:35:51 +00:00
parent 20ff603339
commit 939b19e715
1 changed files with 117 additions and 0 deletions

View File

@ -0,0 +1,117 @@
package org.broadinstitute.sting.playground.gatk.walkers.variants;
import org.broadinstitute.sting.gatk.LocusContext;
import org.broadinstitute.sting.gatk.refdata.rodVariants;
import org.broadinstitute.sting.utils.BaseUtils;
import org.broadinstitute.sting.utils.fasta.IndexedFastaSequenceFile;
import java.io.File;
import java.io.IOException;
import net.sf.picard.reference.ReferenceSequence;
/**
* Created by IntelliJ IDEA.
* User: mmelgar
* Date: Jul 28, 2009
* Time: 3:59:38 PM
* To change this template use File | Settings | File Templates.
*/
public class VECHomopolymer implements VariantExclusionCriterion {
private int extent = 5;
private float frac = 0.8f;
public IndexedFastaSequenceFile refSeq;
private String contig;
public ReferenceSequence contigSeq;
private boolean exclude = false;
public void initialize(String arguments) {
if (arguments != null && !arguments.isEmpty()) {
String[] argPieces = arguments.split(",");
extent = Integer.valueOf(argPieces[0]);
frac = Float.valueOf(argPieces[1]);
}
File refFile = new File ("/seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta");
try {
refSeq = new IndexedFastaSequenceFile(refFile);
} catch (IOException e) {
refSeq = null;
}
}
public boolean useZeroQualityReads() { return false; }
public String getStudyHeader() {
return "";
}
public String getStudyInfo() {
return "";
}
public void compute(char ref, LocusContext context, rodVariants variant) {
char[] geno = variant.getBestGenotype().toCharArray();
int[] genotype = {-1,-1};
for (int h = 0; h < 2; h ++) {
genotype[h] = BaseUtils.simpleBaseToBaseIndex(geno[h]);
}
if (contig == null || !context.getContig().equals(contig)) {
contig = context.getContig();
contigSeq = refSeq.getSequence(contig);
}
String bases = new String(contigSeq.getBases());
String prevbases = bases.substring((int) (context.getPosition() - extent - 1), (int) (context.getPosition() - 1));
String nextbases = bases.substring((int) (context.getPosition() + 1), (int) (context.getPosition() + extent + 1));
int[] prevCounts = {0,0,0,0};
int[] nextCounts = {0,0,0,0};
for (int i = 0; i < extent; i ++) {
int prevBaseIndex = BaseUtils.simpleBaseToBaseIndex(prevbases.toCharArray()[i]);
int nextBaseIndex = BaseUtils.simpleBaseToBaseIndex(nextbases.toCharArray()[i]);
if (prevBaseIndex != -1) {
prevCounts[prevBaseIndex] ++;
}
if (nextBaseIndex != -1) {
nextCounts[nextBaseIndex] ++;
}
}
int prevMaxBase = 0;
int nextMaxBase = 0;
int prevMax = prevCounts[0];
int nextMax = nextCounts[0];
for (int j = 1; j < 4; j ++) {
if (prevCounts[j] > prevMax) {
prevMax = prevCounts[j];
prevMaxBase = j;
}
if (nextCounts[j] > nextMax) {
nextMax = nextCounts[j];
nextMaxBase = j;
}
}
int[] homopolymer = {-1,-1};
if ((float)(prevMax)/(float)(extent) > frac) {homopolymer[0] = prevMaxBase;}
if ((float)(nextMax)/(float)(extent) > frac) {homopolymer[1] = nextMaxBase;}
boolean checkOne = homopolymer[0] == genotype[0] && homopolymer[0] != BaseUtils.simpleBaseToBaseIndex(ref);
boolean checkTwo = homopolymer[0] == genotype[1] && homopolymer[0] != BaseUtils.simpleBaseToBaseIndex(ref);
boolean checkThree = homopolymer[1] == genotype[0] && homopolymer[1] != BaseUtils.simpleBaseToBaseIndex(ref);
boolean checkFour = homopolymer[1] == genotype[1] && homopolymer[1] != BaseUtils.simpleBaseToBaseIndex(ref);
exclude = checkOne || checkTwo || checkThree || checkFour;
}
public boolean isExcludable() {
return exclude;
}
}