warn user when dbSNP rod looks suspicious

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1400 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
ebanks 2009-08-10 20:20:20 +00:00
parent 2841e151d0
commit ecae619a1b
1 changed files with 55 additions and 0 deletions

View File

@ -10,6 +10,8 @@ import org.broadinstitute.sting.gatk.walkers.LocusWalker;
import org.broadinstitute.sting.gatk.walkers.WalkerName;
import org.broadinstitute.sting.utils.cmdLine.Argument;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.Pair;
import org.broadinstitute.sting.utils.BaseUtils;
import java.util.*;
import java.io.PrintStream;
@ -45,6 +47,11 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
private long counted_bases = 0; // number of bases used to count covariates
private long skipped_sites = 0; // number of sites skipped because of a dbSNP entry
private Pair<Long, Long> dbSNP_counts = new Pair<Long, Long>(0l, 0l); // mismatch/base counts for dbSNP loci
private Pair<Long, Long> novel_counts = new Pair<Long, Long>(0l, 0l); // mismatch/base counts for non-dbSNP loci
private static final double DBSNP_VS_NOVEL_MISMATCH_RATE = 2.0; // rate at which dbSNP sites (on an individual level) mismatch relative to novel sites (determined by looking at NA12878)
private static final int DBSNP_VALIDATION_CHECK_FREQUENCY = 1000000; // how often to validate dbsnp mismatch rate (in terms of loci seen)
private int lociSinceLastDbsnpCheck = 0; // loci since last dbsnp validation
/**
* Initialize the system. Setup the data CovariateCountry for the read groups in our header
@ -107,13 +114,61 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
}
}
counted_sites += 1;
updateMismatchCounts(novel_counts, context, ref.getBase());
} else {
updateMismatchCounts(dbSNP_counts, context, ref.getBase());
skipped_sites += 1;
}
if ( ++lociSinceLastDbsnpCheck == DBSNP_VALIDATION_CHECK_FREQUENCY ) {
lociSinceLastDbsnpCheck = 0;
validateDbsnpMismatchRate();
}
return 1;
}
/**
* Update the mismatch / total_base counts for a given class of loci.
*
* @param counts
* @param context
* @return
*/
private static void updateMismatchCounts(Pair<Long, Long> counts, AlignmentContext context, char ref) {
List<SAMRecord> reads = context.getReads();
List<Integer> offsets = context.getOffsets();
for (int i =0; i < reads.size(); i++ ) {
char readChar = reads.get(i).getReadString().charAt(offsets.get(i));
int readCharBaseIndex = BaseUtils.simpleBaseToBaseIndex(readChar);
int refCharBaseIndex = BaseUtils.simpleBaseToBaseIndex(ref);
if ( readCharBaseIndex != -1 && refCharBaseIndex != -1 ) {
if ( readCharBaseIndex != refCharBaseIndex )
counts.first++;
counts.second++;
}
}
}
/**
* Validate the dbSNP mismatch rates.
*
* @return
*/
private void validateDbsnpMismatchRate() {
if ( novel_counts.second == 0 || dbSNP_counts.second == 0 )
return;
double fractionMM_novel = (double)novel_counts.first / (double)novel_counts.second;
double fractionMM_dbsnp = (double)dbSNP_counts.first / (double)dbSNP_counts.second;
//System.out.println(String.format("dbSNP rate = %.2f, novel rate = %.2f", fractionMM_dbsnp, fractionMM_novel));
if ( fractionMM_dbsnp < DBSNP_VS_NOVEL_MISMATCH_RATE * fractionMM_novel )
Utils.warnUser("The variation rate of dbSNP sites seems suspicious! Please double-check that the correct DBSNP ROD is being used.");
}
/**
* Check to see whether this read group should be processed. Returns true if the
* read group is in the list of platforms to process or the platform == *, indicating