AnalyzeAnnotations seems to be popular so I've rewritten the guts to be easier to extend and maintain.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2707 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
rpoplin 2010-01-27 19:30:31 +00:00
parent fa3589e5c5
commit fc4285f9fd
4 changed files with 140 additions and 158 deletions

View File

@ -14,9 +14,9 @@ c <- read.table(input, header=T)
# Plot TiTv ratio as a function of the annotation # Plot TiTv ratio as a function of the annotation
# #
all = c[c$numVariants>minBinCutoff & c$category==0,] all = c[c$numVariants>minBinCutoff & c$category=="all",]
novel = c[c$numVariants>minBinCutoff & c$category==1,] novel = c[c$numVariants>minBinCutoff & c$category=="novel",]
dbsnp = c[c$numVariants>minBinCutoff & c$category==2,] dbsnp = c[c$numVariants>minBinCutoff & c$category=="dbsnp",]
d = c[c$numVariants>minBinCutoff,] d = c[c$numVariants>minBinCutoff,]
ymin = min(d$titv) ymin = min(d$titv)

View File

@ -93,10 +93,10 @@ public class LocusOverflowTracker {
// is the last warning they'll see // is the last warning they'll see
if (warningsEmitted < MAX_WARNINGS) { if (warningsEmitted < MAX_WARNINGS) {
warningsEmitted++; warningsEmitted++;
Utils.warnUser("Unable to add a reads to the pile-up, we're over the hanger limit of " + maxPileupSize + " at location: " + lastLocation); Utils.warnUser("Unable to add reads to the pile-up, we're over the hanger limit of " + maxPileupSize + " at location: " + lastLocation);
} else if (warningsEmitted == MAX_WARNINGS) { } else if (warningsEmitted == MAX_WARNINGS) {
warningsEmitted++; warningsEmitted++;
Utils.warnUser("Unable to add a reads to the pile-up, we're over the hanger limit of " + maxPileupSize + " at location: " + lastLocation + Utils.warnUser("Unable to add reads to the pile-up, we're over the hanger limit of " + maxPileupSize + " at location: " + lastLocation +
"; the maximum warning count has been reached, we will no longer emit warnings of this nature!!"); "; the maximum warning count has been reached, we will no longer emit warnings of this nature!!");
} }
} }

View File

@ -41,22 +41,16 @@ import java.io.FileNotFoundException;
public class AnnotationDataManager { public class AnnotationDataManager {
public final HashMap<String, TreeSet<AnnotationDatum>> dataFull; public final HashMap<String, TreeSet<AnnotationDatum>> data;
public final HashMap<String, TreeSet<AnnotationDatum>> dataNovel;
public final HashMap<String, TreeSet<AnnotationDatum>> dataDBsnp;
public final HashMap<String, TreeSet<AnnotationDatum>> dataTruthSet;
public AnnotationDataManager() { public AnnotationDataManager() {
dataFull = new HashMap<String, TreeSet<AnnotationDatum>>(); data = new HashMap<String, TreeSet<AnnotationDatum>>();
dataNovel = new HashMap<String, TreeSet<AnnotationDatum>>();
dataDBsnp = new HashMap<String, TreeSet<AnnotationDatum>>();
dataTruthSet = new HashMap<String, TreeSet<AnnotationDatum>>();
} }
public void addAnnotations( RodVCF variant, String sampleName ) { public void addAnnotations( RodVCF variant, String sampleName ) {
if( sampleName != null ) { // only process variants that are found in the sample with this sampleName if( sampleName != null ) { // Only process variants that are found in the sample with this sampleName
if( variant.getGenotype(sampleName).isNoCall() ) { // this variant isn't found in this sample so break out if( variant.getGenotype(sampleName).isNoCall() ) { // This variant isn't found in this sample so break out
return; return;
} }
} // else, process all samples } // else, process all samples
@ -69,58 +63,17 @@ public class AnnotationDataManager {
boolean skipThisAnnotation = false; boolean skipThisAnnotation = false;
try { try {
value = Float.parseFloat( infoField.get( annotationKey ) ); value = Float.parseFloat( infoField.get( annotationKey ) );
} catch( Exception e ) { // BUGBUG: Make this exception more specific. NumberFormatException?? } catch( NumberFormatException e ) {
skipThisAnnotation = true; // skip over annotations that aren't floats, like "AC"?? and "DB" skipThisAnnotation = true; // Skip over annotations that aren't floats, like older versions of "AC" and "DB"
} }
if( skipThisAnnotation ) { if( skipThisAnnotation ) {
continue; // skip over annotations that aren't floats, like "AC"?? and "DB" continue; // Skip over annotations that aren't floats, like older versions of "AC" and "DB"
} }
if( variant.getID().equals(".") ) { // This is a novel variant TreeSet<AnnotationDatum> treeSet = data.get( annotationKey );
TreeSet<AnnotationDatum> treeSet = dataNovel.get( annotationKey );
if( treeSet == null ) { // This annotation hasn't been seen before
treeSet = new TreeSet<AnnotationDatum>( new AnnotationDatum() ); // AnnotationDatum is a Comparator that orders variants by the value of the Annotation
dataNovel.put( annotationKey, treeSet );
}
AnnotationDatum datum = new AnnotationDatum( value );
if( treeSet.contains(datum) ) {
datum = treeSet.tailSet(datum).first();
} else {
treeSet.add(datum);
}
// Decide if it was a Ti or a Tv
if( BaseUtils.isTransition(variant.getReferenceForSNP(), variant.getAlternativeBaseForSNP()) ) {
datum.incrementTi();
} else {
datum.incrementTv();
}
} else { // This is a DBsnp variant
TreeSet<AnnotationDatum> treeSet = dataDBsnp.get( annotationKey );
if( treeSet == null ) { // This annotation hasn't been seen before
treeSet = new TreeSet<AnnotationDatum>( new AnnotationDatum() ); // AnnotationDatum is a Comparator that orders variants by the value of the Annotation
dataDBsnp.put( annotationKey, treeSet );
}
AnnotationDatum datum = new AnnotationDatum( value );
if( treeSet.contains(datum) ) {
datum = treeSet.tailSet(datum).first();
} else {
treeSet.add(datum);
}
// Decide if it was a Ti or a Tv
if( BaseUtils.isTransition(variant.getReferenceForSNP(), variant.getAlternativeBaseForSNP()) ) {
datum.incrementTi();
} else {
datum.incrementTv();
}
}
// the overall set, containing both novel and DBsnp variants
TreeSet<AnnotationDatum> treeSet = dataFull.get( annotationKey );
if( treeSet == null ) { // This annotation hasn't been seen before if( treeSet == null ) { // This annotation hasn't been seen before
treeSet = new TreeSet<AnnotationDatum>( new AnnotationDatum() ); // AnnotationDatum is a Comparator that orders variants by the value of the Annotation treeSet = new TreeSet<AnnotationDatum>( new AnnotationDatum() ); // AnnotationDatum is a Comparator that orders variants by the value of the Annotation
dataFull.put( annotationKey, treeSet ); data.put( annotationKey, treeSet );
} }
AnnotationDatum datum = new AnnotationDatum( value ); AnnotationDatum datum = new AnnotationDatum( value );
if( treeSet.contains(datum) ) { if( treeSet.contains(datum) ) {
@ -129,11 +82,14 @@ public class AnnotationDataManager {
treeSet.add(datum); treeSet.add(datum);
} }
final boolean isNovelVariant = variant.getID().equals(".");
final boolean isTrueVariant = false;
// Decide if it was a Ti or a Tv // Decide if it was a Ti or a Tv
if( BaseUtils.isTransition(variant.getReferenceForSNP(), variant.getAlternativeBaseForSNP()) ) { if( BaseUtils.isTransition(variant.getReferenceForSNP(), variant.getAlternativeBaseForSNP()) ) {
datum.incrementTi(); datum.incrementTi( isNovelVariant, isTrueVariant );
} else { } else {
datum.incrementTv(); datum.incrementTv( isNovelVariant, isTrueVariant );
} }
} }
} }
@ -143,12 +99,13 @@ public class AnnotationDataManager {
System.out.println( "\nExecuting RScript commands:" ); System.out.println( "\nExecuting RScript commands:" );
for( String annotationKey: dataFull.keySet() ) { // For each annotation we've seen
for( String annotationKey : data.keySet() ) {
PrintStream output; PrintStream output;
try { try {
output = new PrintStream(OUTPUT_PREFIX + annotationKey + ".dat"); output = new PrintStream(OUTPUT_PREFIX + annotationKey + ".dat"); // Create the data file for this annotation
} catch (FileNotFoundException e) { } catch ( FileNotFoundException e ) {
throw new StingException("Can't create intermediate output annotation data file. Does the output directory exist? " + throw new StingException("Can't create intermediate output annotation data file. Does the output directory exist? " +
OUTPUT_PREFIX + annotationKey + ".dat"); OUTPUT_PREFIX + annotationKey + ".dat");
} }
@ -157,77 +114,23 @@ public class AnnotationDataManager {
output.println("value\ttitv\tnumVariants\tcategory"); output.println("value\ttitv\tnumVariants\tcategory");
// Bin SNPs and calculate truth metrics for each bin, right now this is just TiTv // Bin SNPs and calculate truth metrics for each bin, right now this is just TiTv
int numTi = 0; AnnotationDatum thisAnnotationBin = new AnnotationDatum();
int numTv = 0; for( AnnotationDatum datum : data.get( annotationKey ) ) {
AnnotationDatum lastDatum = null; thisAnnotationBin.combine( datum );
for( AnnotationDatum datum : dataFull.get( annotationKey ) ) { if( thisAnnotationBin.numVariants( AnnotationDatum.FULL_SET ) >= MAX_VARIANTS_PER_BIN ) { // This annotation bin is full
numTi += datum.numTransitions; output.println( thisAnnotationBin.value + "\t" + thisAnnotationBin.calcTiTv( AnnotationDatum.FULL_SET ) + "\t" + thisAnnotationBin.numVariants( AnnotationDatum.FULL_SET ) + "\tall");
numTv += datum.numTransversions; output.println( thisAnnotationBin.value + "\t" + thisAnnotationBin.calcTiTv( AnnotationDatum.NOVEL_SET ) + "\t" + thisAnnotationBin.numVariants( AnnotationDatum.NOVEL_SET ) + "\tnovel");
lastDatum = datum; output.println( thisAnnotationBin.value + "\t" + thisAnnotationBin.calcTiTv( AnnotationDatum.DBSNP_SET ) + "\t" + thisAnnotationBin.numVariants( AnnotationDatum.DBSNP_SET ) + "\tdbsnp");
if( numTi + numTv >= MAX_VARIANTS_PER_BIN ) { // This annotation bin is full thisAnnotationBin.clearBin();
float titv;
if( numTv == 0) { titv = 0.0f; }
else { titv = ((float) numTi) / ((float) numTv); }
output.println(datum.value + "\t" + titv + "\t" + (numTi+numTv) + "\t0");
numTi = 0;
numTv = 0;
} }
// else, continue accumulating variants because this bin isn't full yet // else, continue accumulating variants because this bin isn't full yet
} }
if( numTi != 0 || numTv != 0 ) { // one final bin that may not have been dumped out
float titv;
if( numTv == 0) { titv = 0.0f; }
else { titv = ((float) numTi) / ((float) numTv); }
output.println(lastDatum.value + "\t" + titv + "\t" + (numTi+numTv)+ "\t0");
}
if( thisAnnotationBin.numVariants( AnnotationDatum.FULL_SET ) != 0 ) { // one final bin that may not have been dumped out
output.println( thisAnnotationBin.value + "\t" + thisAnnotationBin.calcTiTv( AnnotationDatum.FULL_SET ) + "\t" + thisAnnotationBin.numVariants( AnnotationDatum.FULL_SET ) + "\tall");
output.println( thisAnnotationBin.value + "\t" + thisAnnotationBin.calcTiTv( AnnotationDatum.NOVEL_SET ) + "\t" + thisAnnotationBin.numVariants( AnnotationDatum.NOVEL_SET ) + "\tnovel");
output.println( thisAnnotationBin.value + "\t" + thisAnnotationBin.calcTiTv( AnnotationDatum.DBSNP_SET ) + "\t" + thisAnnotationBin.numVariants( AnnotationDatum.DBSNP_SET ) + "\tdbsnp");
numTi = 0;
numTv = 0;
lastDatum = null;
for( AnnotationDatum datum : dataNovel.get( annotationKey ) ) {
numTi += datum.numTransitions;
numTv += datum.numTransversions;
lastDatum = datum;
if( numTi + numTv >= MAX_VARIANTS_PER_BIN/2 ) { // This annotation bin is full
float titv;
if( numTv == 0) { titv = 0.0f; }
else { titv = ((float) numTi) / ((float) numTv); }
output.println(datum.value + "\t" + titv + "\t" + (numTi+numTv) + "\t1");
numTi = 0;
numTv = 0;
}
// else, continue accumulating variants because this bin isn't full yet
}
if( numTi != 0 || numTv != 0 ) { // one final bin that may not have been dumped out
float titv;
if( numTv == 0) { titv = 0.0f; }
else { titv = ((float) numTi) / ((float) numTv); }
output.println(lastDatum.value + "\t" + titv + "\t" + (numTi+numTv)+ "\t1");
}
numTi = 0;
numTv = 0;
lastDatum = null;
for( AnnotationDatum datum : dataDBsnp.get( annotationKey ) ) {
numTi += datum.numTransitions;
numTv += datum.numTransversions;
lastDatum = datum;
if( numTi + numTv >= MAX_VARIANTS_PER_BIN ) { // This annotation bin is full
float titv;
if( numTv == 0) { titv = 0.0f; }
else { titv = ((float) numTi) / ((float) numTv); }
output.println(datum.value + "\t" + titv + "\t" + (numTi+numTv) + "\t2");
numTi = 0;
numTv = 0;
}
// else, continue accumulating variants because this bin isn't full yet
}
if( numTi != 0 || numTv != 0 ) { // one final bin that may not have been dumped out
float titv;
if( numTv == 0) { titv = 0.0f; }
else { titv = ((float) numTi) / ((float) numTv); }
output.println(lastDatum.value + "\t" + titv + "\t" + (numTi+numTv)+ "\t2");
} }
output.close(); output.close();
@ -239,7 +142,7 @@ public class AnnotationDataManager {
try { try {
final Process p = Runtime.getRuntime().exec(PATH_TO_RSCRIPT + " " + PATH_TO_RESOURCES + "plot_Annotations_BinnedTiTv.R" + " " + final Process p = Runtime.getRuntime().exec(PATH_TO_RSCRIPT + " " + PATH_TO_RESOURCES + "plot_Annotations_BinnedTiTv.R" + " " +
OUTPUT_PREFIX + annotationKey + ".dat" + " " + annotationKey + " " + MIN_VARIANTS_PER_BIN); OUTPUT_PREFIX + annotationKey + ".dat" + " " + annotationKey + " " + MIN_VARIANTS_PER_BIN);
} catch (Exception e) { } catch ( Exception e ) {
throw new StingException( "Unable to execute RScript command" ); throw new StingException( "Unable to execute RScript command" );
} }
} }

View File

@ -37,50 +37,129 @@ import java.util.Comparator;
public class AnnotationDatum implements Comparator<AnnotationDatum> { public class AnnotationDatum implements Comparator<AnnotationDatum> {
public final float value; public float value;
public int numTransitions; private final int[] ti;
public int numTransversions; private final int[] tv;
public static final int FULL_SET = -1;
public static final int NOVEL_SET = 0;
public static final int DBSNP_SET = 1;
public static final int TRUTH_SET = 2;
public AnnotationDatum() { public AnnotationDatum() {
value = 0.0f; value = 0.0f;
numTransitions = 0; ti = new int[3];
numTransversions = 0; tv = new int[3];
for( int iii = 0; iii < 3; iii++ ) {
ti[iii] = 0;
tv[iii] = 0;
}
} }
public AnnotationDatum( float _value ) { public AnnotationDatum( float _value ) {
value = _value; value = _value;
numTransitions = 0; ti = new int[3];
numTransversions = 0; tv = new int[3];
} for( int iii = 0; iii < 3; iii++ ) {
ti[iii] = 0;
final public void incrementTi() { tv[iii] = 0;
numTransitions++;
}
final public void incrementTv() {
numTransversions++;
}
final public float calcTiTv() {
if( numTransitions < 0 || numTransversions < 0) {
throw new StingException( "Integer overflow detected! There are too many variants piled up in one annotation bin." );
} }
}
if( numTransversions == 0) { final public void incrementTi( final boolean isNovelVariant, final boolean isTrueVariant ) {
if( isNovelVariant ) {
ti[NOVEL_SET]++;
} else { // Is known, in DBsnp
ti[DBSNP_SET]++;
}
if( isTrueVariant ) {
ti[TRUTH_SET]++;
}
}
final public void incrementTv( final boolean isNovelVariant, final boolean isTrueVariant ) {
if( isNovelVariant ) {
tv[NOVEL_SET]++;
} else { // Is known, in DBsnp
tv[DBSNP_SET]++;
}
if( isTrueVariant ) {
tv[TRUTH_SET]++;
}
}
final public void combine( final AnnotationDatum that ) {
for( int iii = 0; iii < 3; iii++ ) {
this.ti[iii] += that.ti[iii];
this.tv[iii] += that.tv[iii];
}
this.value = that.value; // overwrite this bin's value
}
final public float calcTiTv( final int INDEX ) {
if( INDEX == FULL_SET ) {
if( (ti[NOVEL_SET] + ti[DBSNP_SET]) < 0 || (tv[NOVEL_SET] + tv[DBSNP_SET]) < 0 ) {
throw new StingException( "Integer overflow detected! There are too many variants piled up in one annotation bin." );
}
if( (tv[NOVEL_SET] + tv[DBSNP_SET]) == 0 ) { // Don't divide by zero
return 0.0f;
}
return ((float) (ti[NOVEL_SET] + ti[DBSNP_SET])) /
((float) (tv[NOVEL_SET] + tv[DBSNP_SET]));
} else {
if( ti[INDEX] < 0 || tv[INDEX] < 0 ) {
throw new StingException( "Integer overflow detected! There are too many variants piled up in one annotation bin." );
}
if( tv[INDEX] == 0 ) { // Don't divide by zero
return 0.0f;
}
return ((float) ti[INDEX]) / ((float) tv[INDEX]);
}
}
final public float calcTPrate() {
if( ti[NOVEL_SET] + tv[NOVEL_SET] + ti[DBSNP_SET] + tv[DBSNP_SET] == 0 ) { // Don't divide by zero
return 0.0f; return 0.0f;
} }
return ((float) numTransitions) / ((float) numTransversions); return ((float) ti[TRUTH_SET] + tv[TRUTH_SET]) /
((float) ti[NOVEL_SET] + tv[NOVEL_SET] + ti[DBSNP_SET] + tv[DBSNP_SET]);
} }
public int compare( AnnotationDatum a1, AnnotationDatum a2 ) { final public int numVariants( final int INDEX ) {
if( INDEX == FULL_SET ) {
return ti[NOVEL_SET] + tv[NOVEL_SET] + ti[DBSNP_SET] + tv[DBSNP_SET];
} else {
return ti[INDEX] + tv[INDEX];
}
}
final public void clearBin() {
value = 0.0f;
for( int iii = 0; iii < 3; iii++ ) {
ti[iii] = 0;
tv[iii] = 0;
}
}
public int compare( AnnotationDatum a1, AnnotationDatum a2 ) { // Function needed for this to be a Comparator
if( a1.value < a2.value ) { return -1; } if( a1.value < a2.value ) { return -1; }
if( a1.value > a2.value ) { return 1; } if( a1.value > a2.value ) { return 1; }
return 0; return 0;
} }
public int equals( AnnotationDatum that ) { public int equals( AnnotationDatum that ) { // Function needed for this to be sorted correctly in a TreeSet
if( this.value < that.value ) { return -1; } if( this.value < that.value ) { return -1; }
if( this.value > that.value ) { return 1; } if( this.value > that.value ) { return 1; }
return 0; return 0;