AnalyzeAnnotations seems to be popular so I've rewritten the guts to be easier to extend and maintain.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2707 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
rpoplin 2010-01-27 19:30:31 +00:00
parent fa3589e5c5
commit fc4285f9fd
4 changed files with 140 additions and 158 deletions

View File

@ -14,9 +14,9 @@ c <- read.table(input, header=T)
# Plot TiTv ratio as a function of the annotation
#
all = c[c$numVariants>minBinCutoff & c$category==0,]
novel = c[c$numVariants>minBinCutoff & c$category==1,]
dbsnp = c[c$numVariants>minBinCutoff & c$category==2,]
all = c[c$numVariants>minBinCutoff & c$category=="all",]
novel = c[c$numVariants>minBinCutoff & c$category=="novel",]
dbsnp = c[c$numVariants>minBinCutoff & c$category=="dbsnp",]
d = c[c$numVariants>minBinCutoff,]
ymin = min(d$titv)

View File

@ -93,10 +93,10 @@ public class LocusOverflowTracker {
// is the last warning they'll see
if (warningsEmitted < MAX_WARNINGS) {
warningsEmitted++;
Utils.warnUser("Unable to add a reads to the pile-up, we're over the hanger limit of " + maxPileupSize + " at location: " + lastLocation);
Utils.warnUser("Unable to add reads to the pile-up, we're over the hanger limit of " + maxPileupSize + " at location: " + lastLocation);
} else if (warningsEmitted == MAX_WARNINGS) {
warningsEmitted++;
Utils.warnUser("Unable to add a reads to the pile-up, we're over the hanger limit of " + maxPileupSize + " at location: " + lastLocation +
Utils.warnUser("Unable to add reads to the pile-up, we're over the hanger limit of " + maxPileupSize + " at location: " + lastLocation +
"; the maximum warning count has been reached, we will no longer emit warnings of this nature!!");
}
}

View File

@ -41,22 +41,16 @@ import java.io.FileNotFoundException;
public class AnnotationDataManager {
public final HashMap<String, TreeSet<AnnotationDatum>> dataFull;
public final HashMap<String, TreeSet<AnnotationDatum>> dataNovel;
public final HashMap<String, TreeSet<AnnotationDatum>> dataDBsnp;
public final HashMap<String, TreeSet<AnnotationDatum>> dataTruthSet;
public final HashMap<String, TreeSet<AnnotationDatum>> data;
public AnnotationDataManager() {
dataFull = new HashMap<String, TreeSet<AnnotationDatum>>();
dataNovel = new HashMap<String, TreeSet<AnnotationDatum>>();
dataDBsnp = new HashMap<String, TreeSet<AnnotationDatum>>();
dataTruthSet = new HashMap<String, TreeSet<AnnotationDatum>>();
data = new HashMap<String, TreeSet<AnnotationDatum>>();
}
public void addAnnotations( RodVCF variant, String sampleName ) {
if( sampleName != null ) { // only process variants that are found in the sample with this sampleName
if( variant.getGenotype(sampleName).isNoCall() ) { // this variant isn't found in this sample so break out
if( sampleName != null ) { // Only process variants that are found in the sample with this sampleName
if( variant.getGenotype(sampleName).isNoCall() ) { // This variant isn't found in this sample so break out
return;
}
} // else, process all samples
@ -69,58 +63,17 @@ public class AnnotationDataManager {
boolean skipThisAnnotation = false;
try {
value = Float.parseFloat( infoField.get( annotationKey ) );
} catch( Exception e ) { // BUGBUG: Make this exception more specific. NumberFormatException??
skipThisAnnotation = true; // skip over annotations that aren't floats, like "AC"?? and "DB"
} catch( NumberFormatException e ) {
skipThisAnnotation = true; // Skip over annotations that aren't floats, like older versions of "AC" and "DB"
}
if( skipThisAnnotation ) {
continue; // skip over annotations that aren't floats, like "AC"?? and "DB"
continue; // Skip over annotations that aren't floats, like older versions of "AC" and "DB"
}
if( variant.getID().equals(".") ) { // This is a novel variant
TreeSet<AnnotationDatum> treeSet = dataNovel.get( annotationKey );
if( treeSet == null ) { // This annotation hasn't been seen before
treeSet = new TreeSet<AnnotationDatum>( new AnnotationDatum() ); // AnnotationDatum is a Comparator that orders variants by the value of the Annotation
dataNovel.put( annotationKey, treeSet );
}
AnnotationDatum datum = new AnnotationDatum( value );
if( treeSet.contains(datum) ) {
datum = treeSet.tailSet(datum).first();
} else {
treeSet.add(datum);
}
// Decide if it was a Ti or a Tv
if( BaseUtils.isTransition(variant.getReferenceForSNP(), variant.getAlternativeBaseForSNP()) ) {
datum.incrementTi();
} else {
datum.incrementTv();
}
} else { // This is a DBsnp variant
TreeSet<AnnotationDatum> treeSet = dataDBsnp.get( annotationKey );
if( treeSet == null ) { // This annotation hasn't been seen before
treeSet = new TreeSet<AnnotationDatum>( new AnnotationDatum() ); // AnnotationDatum is a Comparator that orders variants by the value of the Annotation
dataDBsnp.put( annotationKey, treeSet );
}
AnnotationDatum datum = new AnnotationDatum( value );
if( treeSet.contains(datum) ) {
datum = treeSet.tailSet(datum).first();
} else {
treeSet.add(datum);
}
// Decide if it was a Ti or a Tv
if( BaseUtils.isTransition(variant.getReferenceForSNP(), variant.getAlternativeBaseForSNP()) ) {
datum.incrementTi();
} else {
datum.incrementTv();
}
}
// the overall set, containing both novel and DBsnp variants
TreeSet<AnnotationDatum> treeSet = dataFull.get( annotationKey );
TreeSet<AnnotationDatum> treeSet = data.get( annotationKey );
if( treeSet == null ) { // This annotation hasn't been seen before
treeSet = new TreeSet<AnnotationDatum>( new AnnotationDatum() ); // AnnotationDatum is a Comparator that orders variants by the value of the Annotation
dataFull.put( annotationKey, treeSet );
data.put( annotationKey, treeSet );
}
AnnotationDatum datum = new AnnotationDatum( value );
if( treeSet.contains(datum) ) {
@ -129,11 +82,14 @@ public class AnnotationDataManager {
treeSet.add(datum);
}
final boolean isNovelVariant = variant.getID().equals(".");
final boolean isTrueVariant = false;
// Decide if it was a Ti or a Tv
if( BaseUtils.isTransition(variant.getReferenceForSNP(), variant.getAlternativeBaseForSNP()) ) {
datum.incrementTi();
datum.incrementTi( isNovelVariant, isTrueVariant );
} else {
datum.incrementTv();
datum.incrementTv( isNovelVariant, isTrueVariant );
}
}
}
@ -143,12 +99,13 @@ public class AnnotationDataManager {
System.out.println( "\nExecuting RScript commands:" );
for( String annotationKey: dataFull.keySet() ) {
// For each annotation we've seen
for( String annotationKey : data.keySet() ) {
PrintStream output;
try {
output = new PrintStream(OUTPUT_PREFIX + annotationKey + ".dat");
} catch (FileNotFoundException e) {
output = new PrintStream(OUTPUT_PREFIX + annotationKey + ".dat"); // Create the data file for this annotation
} catch ( FileNotFoundException e ) {
throw new StingException("Can't create intermediate output annotation data file. Does the output directory exist? " +
OUTPUT_PREFIX + annotationKey + ".dat");
}
@ -157,77 +114,23 @@ public class AnnotationDataManager {
output.println("value\ttitv\tnumVariants\tcategory");
// Bin SNPs and calculate truth metrics for each bin, right now this is just TiTv
int numTi = 0;
int numTv = 0;
AnnotationDatum lastDatum = null;
for( AnnotationDatum datum : dataFull.get( annotationKey ) ) {
numTi += datum.numTransitions;
numTv += datum.numTransversions;
lastDatum = datum;
if( numTi + numTv >= MAX_VARIANTS_PER_BIN ) { // This annotation bin is full
float titv;
if( numTv == 0) { titv = 0.0f; }
else { titv = ((float) numTi) / ((float) numTv); }
output.println(datum.value + "\t" + titv + "\t" + (numTi+numTv) + "\t0");
numTi = 0;
numTv = 0;
AnnotationDatum thisAnnotationBin = new AnnotationDatum();
for( AnnotationDatum datum : data.get( annotationKey ) ) {
thisAnnotationBin.combine( datum );
if( thisAnnotationBin.numVariants( AnnotationDatum.FULL_SET ) >= MAX_VARIANTS_PER_BIN ) { // This annotation bin is full
output.println( thisAnnotationBin.value + "\t" + thisAnnotationBin.calcTiTv( AnnotationDatum.FULL_SET ) + "\t" + thisAnnotationBin.numVariants( AnnotationDatum.FULL_SET ) + "\tall");
output.println( thisAnnotationBin.value + "\t" + thisAnnotationBin.calcTiTv( AnnotationDatum.NOVEL_SET ) + "\t" + thisAnnotationBin.numVariants( AnnotationDatum.NOVEL_SET ) + "\tnovel");
output.println( thisAnnotationBin.value + "\t" + thisAnnotationBin.calcTiTv( AnnotationDatum.DBSNP_SET ) + "\t" + thisAnnotationBin.numVariants( AnnotationDatum.DBSNP_SET ) + "\tdbsnp");
thisAnnotationBin.clearBin();
}
// else, continue accumulating variants because this bin isn't full yet
}
if( numTi != 0 || numTv != 0 ) { // one final bin that may not have been dumped out
float titv;
if( numTv == 0) { titv = 0.0f; }
else { titv = ((float) numTi) / ((float) numTv); }
output.println(lastDatum.value + "\t" + titv + "\t" + (numTi+numTv)+ "\t0");
}
if( thisAnnotationBin.numVariants( AnnotationDatum.FULL_SET ) != 0 ) { // one final bin that may not have been dumped out
output.println( thisAnnotationBin.value + "\t" + thisAnnotationBin.calcTiTv( AnnotationDatum.FULL_SET ) + "\t" + thisAnnotationBin.numVariants( AnnotationDatum.FULL_SET ) + "\tall");
output.println( thisAnnotationBin.value + "\t" + thisAnnotationBin.calcTiTv( AnnotationDatum.NOVEL_SET ) + "\t" + thisAnnotationBin.numVariants( AnnotationDatum.NOVEL_SET ) + "\tnovel");
output.println( thisAnnotationBin.value + "\t" + thisAnnotationBin.calcTiTv( AnnotationDatum.DBSNP_SET ) + "\t" + thisAnnotationBin.numVariants( AnnotationDatum.DBSNP_SET ) + "\tdbsnp");
numTi = 0;
numTv = 0;
lastDatum = null;
for( AnnotationDatum datum : dataNovel.get( annotationKey ) ) {
numTi += datum.numTransitions;
numTv += datum.numTransversions;
lastDatum = datum;
if( numTi + numTv >= MAX_VARIANTS_PER_BIN/2 ) { // This annotation bin is full
float titv;
if( numTv == 0) { titv = 0.0f; }
else { titv = ((float) numTi) / ((float) numTv); }
output.println(datum.value + "\t" + titv + "\t" + (numTi+numTv) + "\t1");
numTi = 0;
numTv = 0;
}
// else, continue accumulating variants because this bin isn't full yet
}
if( numTi != 0 || numTv != 0 ) { // one final bin that may not have been dumped out
float titv;
if( numTv == 0) { titv = 0.0f; }
else { titv = ((float) numTi) / ((float) numTv); }
output.println(lastDatum.value + "\t" + titv + "\t" + (numTi+numTv)+ "\t1");
}
numTi = 0;
numTv = 0;
lastDatum = null;
for( AnnotationDatum datum : dataDBsnp.get( annotationKey ) ) {
numTi += datum.numTransitions;
numTv += datum.numTransversions;
lastDatum = datum;
if( numTi + numTv >= MAX_VARIANTS_PER_BIN ) { // This annotation bin is full
float titv;
if( numTv == 0) { titv = 0.0f; }
else { titv = ((float) numTi) / ((float) numTv); }
output.println(datum.value + "\t" + titv + "\t" + (numTi+numTv) + "\t2");
numTi = 0;
numTv = 0;
}
// else, continue accumulating variants because this bin isn't full yet
}
if( numTi != 0 || numTv != 0 ) { // one final bin that may not have been dumped out
float titv;
if( numTv == 0) { titv = 0.0f; }
else { titv = ((float) numTi) / ((float) numTv); }
output.println(lastDatum.value + "\t" + titv + "\t" + (numTi+numTv)+ "\t2");
}
output.close();
@ -239,7 +142,7 @@ public class AnnotationDataManager {
try {
final Process p = Runtime.getRuntime().exec(PATH_TO_RSCRIPT + " " + PATH_TO_RESOURCES + "plot_Annotations_BinnedTiTv.R" + " " +
OUTPUT_PREFIX + annotationKey + ".dat" + " " + annotationKey + " " + MIN_VARIANTS_PER_BIN);
} catch (Exception e) {
} catch ( Exception e ) {
throw new StingException( "Unable to execute RScript command" );
}
}

View File

@ -37,50 +37,129 @@ import java.util.Comparator;
public class AnnotationDatum implements Comparator<AnnotationDatum> {
public final float value;
public int numTransitions;
public int numTransversions;
public float value;
private final int[] ti;
private final int[] tv;
public static final int FULL_SET = -1;
public static final int NOVEL_SET = 0;
public static final int DBSNP_SET = 1;
public static final int TRUTH_SET = 2;
public AnnotationDatum() {
value = 0.0f;
numTransitions = 0;
numTransversions = 0;
ti = new int[3];
tv = new int[3];
for( int iii = 0; iii < 3; iii++ ) {
ti[iii] = 0;
tv[iii] = 0;
}
}
public AnnotationDatum( float _value ) {
value = _value;
numTransitions = 0;
numTransversions = 0;
}
final public void incrementTi() {
numTransitions++;
}
final public void incrementTv() {
numTransversions++;
}
final public float calcTiTv() {
if( numTransitions < 0 || numTransversions < 0) {
throw new StingException( "Integer overflow detected! There are too many variants piled up in one annotation bin." );
ti = new int[3];
tv = new int[3];
for( int iii = 0; iii < 3; iii++ ) {
ti[iii] = 0;
tv[iii] = 0;
}
}
if( numTransversions == 0) {
final public void incrementTi( final boolean isNovelVariant, final boolean isTrueVariant ) {
if( isNovelVariant ) {
ti[NOVEL_SET]++;
} else { // Is known, in DBsnp
ti[DBSNP_SET]++;
}
if( isTrueVariant ) {
ti[TRUTH_SET]++;
}
}
final public void incrementTv( final boolean isNovelVariant, final boolean isTrueVariant ) {
if( isNovelVariant ) {
tv[NOVEL_SET]++;
} else { // Is known, in DBsnp
tv[DBSNP_SET]++;
}
if( isTrueVariant ) {
tv[TRUTH_SET]++;
}
}
final public void combine( final AnnotationDatum that ) {
for( int iii = 0; iii < 3; iii++ ) {
this.ti[iii] += that.ti[iii];
this.tv[iii] += that.tv[iii];
}
this.value = that.value; // overwrite this bin's value
}
final public float calcTiTv( final int INDEX ) {
if( INDEX == FULL_SET ) {
if( (ti[NOVEL_SET] + ti[DBSNP_SET]) < 0 || (tv[NOVEL_SET] + tv[DBSNP_SET]) < 0 ) {
throw new StingException( "Integer overflow detected! There are too many variants piled up in one annotation bin." );
}
if( (tv[NOVEL_SET] + tv[DBSNP_SET]) == 0 ) { // Don't divide by zero
return 0.0f;
}
return ((float) (ti[NOVEL_SET] + ti[DBSNP_SET])) /
((float) (tv[NOVEL_SET] + tv[DBSNP_SET]));
} else {
if( ti[INDEX] < 0 || tv[INDEX] < 0 ) {
throw new StingException( "Integer overflow detected! There are too many variants piled up in one annotation bin." );
}
if( tv[INDEX] == 0 ) { // Don't divide by zero
return 0.0f;
}
return ((float) ti[INDEX]) / ((float) tv[INDEX]);
}
}
final public float calcTPrate() {
if( ti[NOVEL_SET] + tv[NOVEL_SET] + ti[DBSNP_SET] + tv[DBSNP_SET] == 0 ) { // Don't divide by zero
return 0.0f;
}
return ((float) numTransitions) / ((float) numTransversions);
return ((float) ti[TRUTH_SET] + tv[TRUTH_SET]) /
((float) ti[NOVEL_SET] + tv[NOVEL_SET] + ti[DBSNP_SET] + tv[DBSNP_SET]);
}
public int compare( AnnotationDatum a1, AnnotationDatum a2 ) {
final public int numVariants( final int INDEX ) {
if( INDEX == FULL_SET ) {
return ti[NOVEL_SET] + tv[NOVEL_SET] + ti[DBSNP_SET] + tv[DBSNP_SET];
} else {
return ti[INDEX] + tv[INDEX];
}
}
final public void clearBin() {
value = 0.0f;
for( int iii = 0; iii < 3; iii++ ) {
ti[iii] = 0;
tv[iii] = 0;
}
}
public int compare( AnnotationDatum a1, AnnotationDatum a2 ) { // Function needed for this to be a Comparator
if( a1.value < a2.value ) { return -1; }
if( a1.value > a2.value ) { return 1; }
return 0;
}
public int equals( AnnotationDatum that ) {
public int equals( AnnotationDatum that ) { // Function needed for this to be sorted correctly in a TreeSet
if( this.value < that.value ) { return -1; }
if( this.value > that.value ) { return 1; }
return 0;