First fully functional implementation of ValidationSiteSelectorWalker. User gives a) a set of input variants, b) a desired number of output variants, b) Optionally, a set of samples which will restrict sites to be polymorphic in those samples, c) a frequency selection mode: either uniform (no AF matching), or matching AF so that output sites mirror the input AF spectrum as closely as possible.

More testing is needed and docs need improving but so far all functionality seems up and running
This commit is contained in:
Guillermo del Angel 2011-10-27 20:53:48 -04:00
parent f7df8bdecc
commit 8907e42007
1 changed files with 2 additions and 84 deletions

View File

@ -255,14 +255,6 @@ public class SelectVariants extends RodWalker<Integer, Integer> {
@Argument(fullName="keepOriginalAC", shortName="keepOriginalAC", doc="Don't update the AC, AF, or AN values in the INFO field after selecting", required=false)
private boolean KEEP_ORIGINAL_CHR_COUNTS = false;
@Hidden
@Argument(fullName="keepAFSpectrum", shortName="keepAF", doc="Don't include loci found to be non-variant after the subsetting procedure", required=false)
private boolean KEEP_AF_SPECTRUM = false;
@Hidden
@Argument(fullName="afFile", shortName="afFile", doc="The output recal file used by ApplyRecalibration", required=false)
private File AF_FILE = new File("");
@Hidden
@Argument(fullName="family_structure_file", shortName="familyFile", doc="use -family unless you know what you're doing", required=false)
private File FAMILY_STRUCTURE_FILE = null;
@ -442,7 +434,7 @@ public class SelectVariants extends RodWalker<Integer, Integer> {
mvSet.add(mv);
}
} catch ( FileNotFoundException e ) {
throw new UserException.CouldNotReadInputFile(AF_FILE, e);
throw new UserException.CouldNotReadInputFile(FAMILY_STRUCTURE_FILE, e);
}
if (outMVFile != null)
try {
@ -469,31 +461,7 @@ public class SelectVariants extends RodWalker<Integer, Integer> {
if (SELECT_RANDOM_FRACTION) logger.info("Selecting approximately " + 100.0*fractionRandom + "% of the variants at random from the variant track");
if (KEEP_AF_SPECTRUM) {
try {
afBreakpoints = new ArrayList<Double>();
afBoosts = new ArrayList<Double>();
logger.info("Reading in AF boost table...");
boolean firstLine = false;
for ( final String line : new XReadLines( AF_FILE ) ) {
if (!firstLine) {
firstLine = true;
continue;
}
final String[] vals = line.split(" ");
double bkp = Double.valueOf(vals[0]);
double afb = Double.valueOf(vals[1]);
afBreakpoints.add(bkp);
afBoosts.add(afb);
}
bkDelta = afBreakpoints.get(0);
} catch ( FileNotFoundException e ) {
throw new UserException.CouldNotReadInputFile(AF_FILE, e);
}
}
}
/**
@ -566,61 +534,11 @@ public class SelectVariants extends RodWalker<Integer, Integer> {
if (SELECT_RANDOM_NUMBER) {
randomlyAddVariant(++variantNumber, sub, ref.getBase());
}
else if (!SELECT_RANDOM_FRACTION || (!KEEP_AF_SPECTRUM && GenomeAnalysisEngine.getRandomGenerator().nextDouble() < fractionRandom)) {
else if (!SELECT_RANDOM_FRACTION || ( GenomeAnalysisEngine.getRandomGenerator().nextDouble() < fractionRandom)) {
vcfWriter.add(sub);
}
else {
if (SELECT_RANDOM_FRACTION && KEEP_AF_SPECTRUM ) {
// ok we have a comp VC and we need to match the AF spectrum of inputAFRodName.
// We then pick a variant with probablity AF*desiredFraction
if ( sub.hasAttribute(VCFConstants.ALLELE_FREQUENCY_KEY) ) {
String afo = sub.getAttributeAsString(VCFConstants.ALLELE_FREQUENCY_KEY, null);
double af;
double afBoost = 1.0;
if (afo.contains(",")) {
String[] afs = afo.split(",");
afs[0] = afs[0].substring(1,afs[0].length());
afs[afs.length-1] = afs[afs.length-1].substring(0,afs[afs.length-1].length()-1);
double[] afd = new double[afs.length];
for (int k=0; k < afd.length; k++)
afd[k] = Double.valueOf(afs[k]);
af = MathUtils.arrayMax(afd);
//af = Double.valueOf(afs[0]);
}
else
af = Double.valueOf(afo);
// now boost af by table read from file if desired
//double bkpt = 0.0;
int bkidx = 0;
if (!afBreakpoints.isEmpty()) {
for ( Double bkpt : afBreakpoints) {
if (af < bkpt + bkDelta)
break;
else bkidx++;
}
if (bkidx >=afBoosts.size())
bkidx = afBoosts.size()-1;
afBoost = afBoosts.get(bkidx);
//System.out.formatPrin("af:%f bkidx:%d afboost:%f\n",af,bkidx,afBoost);
}
//System.out.format("%s .. %4.4f\n",afo.toString(), af);
if (GenomeAnalysisEngine.getRandomGenerator().nextDouble() < fractionRandom * afBoost * afBoost)
vcfWriter.add(sub);
}
}
}
}
}