AnalyzeAnnotations seems to be popular so I've rewritten the guts to be easier to extend and maintain.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2707 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
fa3589e5c5
commit
fc4285f9fd
|
|
@ -14,9 +14,9 @@ c <- read.table(input, header=T)
|
|||
# Plot TiTv ratio as a function of the annotation
|
||||
#
|
||||
|
||||
all = c[c$numVariants>minBinCutoff & c$category==0,]
|
||||
novel = c[c$numVariants>minBinCutoff & c$category==1,]
|
||||
dbsnp = c[c$numVariants>minBinCutoff & c$category==2,]
|
||||
all = c[c$numVariants>minBinCutoff & c$category=="all",]
|
||||
novel = c[c$numVariants>minBinCutoff & c$category=="novel",]
|
||||
dbsnp = c[c$numVariants>minBinCutoff & c$category=="dbsnp",]
|
||||
|
||||
d = c[c$numVariants>minBinCutoff,]
|
||||
ymin = min(d$titv)
|
||||
|
|
|
|||
|
|
@ -93,10 +93,10 @@ public class LocusOverflowTracker {
|
|||
// is the last warning they'll see
|
||||
if (warningsEmitted < MAX_WARNINGS) {
|
||||
warningsEmitted++;
|
||||
Utils.warnUser("Unable to add a reads to the pile-up, we're over the hanger limit of " + maxPileupSize + " at location: " + lastLocation);
|
||||
Utils.warnUser("Unable to add reads to the pile-up, we're over the hanger limit of " + maxPileupSize + " at location: " + lastLocation);
|
||||
} else if (warningsEmitted == MAX_WARNINGS) {
|
||||
warningsEmitted++;
|
||||
Utils.warnUser("Unable to add a reads to the pile-up, we're over the hanger limit of " + maxPileupSize + " at location: " + lastLocation +
|
||||
Utils.warnUser("Unable to add reads to the pile-up, we're over the hanger limit of " + maxPileupSize + " at location: " + lastLocation +
|
||||
"; the maximum warning count has been reached, we will no longer emit warnings of this nature!!");
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -41,22 +41,16 @@ import java.io.FileNotFoundException;
|
|||
|
||||
public class AnnotationDataManager {
|
||||
|
||||
public final HashMap<String, TreeSet<AnnotationDatum>> dataFull;
|
||||
public final HashMap<String, TreeSet<AnnotationDatum>> dataNovel;
|
||||
public final HashMap<String, TreeSet<AnnotationDatum>> dataDBsnp;
|
||||
public final HashMap<String, TreeSet<AnnotationDatum>> dataTruthSet;
|
||||
public final HashMap<String, TreeSet<AnnotationDatum>> data;
|
||||
|
||||
public AnnotationDataManager() {
|
||||
dataFull = new HashMap<String, TreeSet<AnnotationDatum>>();
|
||||
dataNovel = new HashMap<String, TreeSet<AnnotationDatum>>();
|
||||
dataDBsnp = new HashMap<String, TreeSet<AnnotationDatum>>();
|
||||
dataTruthSet = new HashMap<String, TreeSet<AnnotationDatum>>();
|
||||
data = new HashMap<String, TreeSet<AnnotationDatum>>();
|
||||
}
|
||||
|
||||
public void addAnnotations( RodVCF variant, String sampleName ) {
|
||||
|
||||
if( sampleName != null ) { // only process variants that are found in the sample with this sampleName
|
||||
if( variant.getGenotype(sampleName).isNoCall() ) { // this variant isn't found in this sample so break out
|
||||
if( sampleName != null ) { // Only process variants that are found in the sample with this sampleName
|
||||
if( variant.getGenotype(sampleName).isNoCall() ) { // This variant isn't found in this sample so break out
|
||||
return;
|
||||
}
|
||||
} // else, process all samples
|
||||
|
|
@ -69,58 +63,17 @@ public class AnnotationDataManager {
|
|||
boolean skipThisAnnotation = false;
|
||||
try {
|
||||
value = Float.parseFloat( infoField.get( annotationKey ) );
|
||||
} catch( Exception e ) { // BUGBUG: Make this exception more specific. NumberFormatException??
|
||||
skipThisAnnotation = true; // skip over annotations that aren't floats, like "AC"?? and "DB"
|
||||
} catch( NumberFormatException e ) {
|
||||
skipThisAnnotation = true; // Skip over annotations that aren't floats, like older versions of "AC" and "DB"
|
||||
}
|
||||
if( skipThisAnnotation ) {
|
||||
continue; // skip over annotations that aren't floats, like "AC"?? and "DB"
|
||||
continue; // Skip over annotations that aren't floats, like older versions of "AC" and "DB"
|
||||
}
|
||||
|
||||
if( variant.getID().equals(".") ) { // This is a novel variant
|
||||
TreeSet<AnnotationDatum> treeSet = dataNovel.get( annotationKey );
|
||||
if( treeSet == null ) { // This annotation hasn't been seen before
|
||||
treeSet = new TreeSet<AnnotationDatum>( new AnnotationDatum() ); // AnnotationDatum is a Comparator that orders variants by the value of the Annotation
|
||||
dataNovel.put( annotationKey, treeSet );
|
||||
}
|
||||
AnnotationDatum datum = new AnnotationDatum( value );
|
||||
if( treeSet.contains(datum) ) {
|
||||
datum = treeSet.tailSet(datum).first();
|
||||
} else {
|
||||
treeSet.add(datum);
|
||||
}
|
||||
|
||||
// Decide if it was a Ti or a Tv
|
||||
if( BaseUtils.isTransition(variant.getReferenceForSNP(), variant.getAlternativeBaseForSNP()) ) {
|
||||
datum.incrementTi();
|
||||
} else {
|
||||
datum.incrementTv();
|
||||
}
|
||||
} else { // This is a DBsnp variant
|
||||
TreeSet<AnnotationDatum> treeSet = dataDBsnp.get( annotationKey );
|
||||
if( treeSet == null ) { // This annotation hasn't been seen before
|
||||
treeSet = new TreeSet<AnnotationDatum>( new AnnotationDatum() ); // AnnotationDatum is a Comparator that orders variants by the value of the Annotation
|
||||
dataDBsnp.put( annotationKey, treeSet );
|
||||
}
|
||||
AnnotationDatum datum = new AnnotationDatum( value );
|
||||
if( treeSet.contains(datum) ) {
|
||||
datum = treeSet.tailSet(datum).first();
|
||||
} else {
|
||||
treeSet.add(datum);
|
||||
}
|
||||
|
||||
// Decide if it was a Ti or a Tv
|
||||
if( BaseUtils.isTransition(variant.getReferenceForSNP(), variant.getAlternativeBaseForSNP()) ) {
|
||||
datum.incrementTi();
|
||||
} else {
|
||||
datum.incrementTv();
|
||||
}
|
||||
}
|
||||
|
||||
// the overall set, containing both novel and DBsnp variants
|
||||
TreeSet<AnnotationDatum> treeSet = dataFull.get( annotationKey );
|
||||
TreeSet<AnnotationDatum> treeSet = data.get( annotationKey );
|
||||
if( treeSet == null ) { // This annotation hasn't been seen before
|
||||
treeSet = new TreeSet<AnnotationDatum>( new AnnotationDatum() ); // AnnotationDatum is a Comparator that orders variants by the value of the Annotation
|
||||
dataFull.put( annotationKey, treeSet );
|
||||
data.put( annotationKey, treeSet );
|
||||
}
|
||||
AnnotationDatum datum = new AnnotationDatum( value );
|
||||
if( treeSet.contains(datum) ) {
|
||||
|
|
@ -129,11 +82,14 @@ public class AnnotationDataManager {
|
|||
treeSet.add(datum);
|
||||
}
|
||||
|
||||
final boolean isNovelVariant = variant.getID().equals(".");
|
||||
final boolean isTrueVariant = false;
|
||||
|
||||
// Decide if it was a Ti or a Tv
|
||||
if( BaseUtils.isTransition(variant.getReferenceForSNP(), variant.getAlternativeBaseForSNP()) ) {
|
||||
datum.incrementTi();
|
||||
datum.incrementTi( isNovelVariant, isTrueVariant );
|
||||
} else {
|
||||
datum.incrementTv();
|
||||
datum.incrementTv( isNovelVariant, isTrueVariant );
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -143,12 +99,13 @@ public class AnnotationDataManager {
|
|||
|
||||
System.out.println( "\nExecuting RScript commands:" );
|
||||
|
||||
for( String annotationKey: dataFull.keySet() ) {
|
||||
// For each annotation we've seen
|
||||
for( String annotationKey : data.keySet() ) {
|
||||
|
||||
PrintStream output;
|
||||
try {
|
||||
output = new PrintStream(OUTPUT_PREFIX + annotationKey + ".dat");
|
||||
} catch (FileNotFoundException e) {
|
||||
output = new PrintStream(OUTPUT_PREFIX + annotationKey + ".dat"); // Create the data file for this annotation
|
||||
} catch ( FileNotFoundException e ) {
|
||||
throw new StingException("Can't create intermediate output annotation data file. Does the output directory exist? " +
|
||||
OUTPUT_PREFIX + annotationKey + ".dat");
|
||||
}
|
||||
|
|
@ -157,77 +114,23 @@ public class AnnotationDataManager {
|
|||
output.println("value\ttitv\tnumVariants\tcategory");
|
||||
|
||||
// Bin SNPs and calculate truth metrics for each bin, right now this is just TiTv
|
||||
int numTi = 0;
|
||||
int numTv = 0;
|
||||
AnnotationDatum lastDatum = null;
|
||||
for( AnnotationDatum datum : dataFull.get( annotationKey ) ) {
|
||||
numTi += datum.numTransitions;
|
||||
numTv += datum.numTransversions;
|
||||
lastDatum = datum;
|
||||
if( numTi + numTv >= MAX_VARIANTS_PER_BIN ) { // This annotation bin is full
|
||||
float titv;
|
||||
if( numTv == 0) { titv = 0.0f; }
|
||||
else { titv = ((float) numTi) / ((float) numTv); }
|
||||
output.println(datum.value + "\t" + titv + "\t" + (numTi+numTv) + "\t0");
|
||||
numTi = 0;
|
||||
numTv = 0;
|
||||
AnnotationDatum thisAnnotationBin = new AnnotationDatum();
|
||||
for( AnnotationDatum datum : data.get( annotationKey ) ) {
|
||||
thisAnnotationBin.combine( datum );
|
||||
if( thisAnnotationBin.numVariants( AnnotationDatum.FULL_SET ) >= MAX_VARIANTS_PER_BIN ) { // This annotation bin is full
|
||||
output.println( thisAnnotationBin.value + "\t" + thisAnnotationBin.calcTiTv( AnnotationDatum.FULL_SET ) + "\t" + thisAnnotationBin.numVariants( AnnotationDatum.FULL_SET ) + "\tall");
|
||||
output.println( thisAnnotationBin.value + "\t" + thisAnnotationBin.calcTiTv( AnnotationDatum.NOVEL_SET ) + "\t" + thisAnnotationBin.numVariants( AnnotationDatum.NOVEL_SET ) + "\tnovel");
|
||||
output.println( thisAnnotationBin.value + "\t" + thisAnnotationBin.calcTiTv( AnnotationDatum.DBSNP_SET ) + "\t" + thisAnnotationBin.numVariants( AnnotationDatum.DBSNP_SET ) + "\tdbsnp");
|
||||
thisAnnotationBin.clearBin();
|
||||
}
|
||||
// else, continue accumulating variants because this bin isn't full yet
|
||||
}
|
||||
if( numTi != 0 || numTv != 0 ) { // one final bin that may not have been dumped out
|
||||
float titv;
|
||||
if( numTv == 0) { titv = 0.0f; }
|
||||
else { titv = ((float) numTi) / ((float) numTv); }
|
||||
output.println(lastDatum.value + "\t" + titv + "\t" + (numTi+numTv)+ "\t0");
|
||||
}
|
||||
|
||||
if( thisAnnotationBin.numVariants( AnnotationDatum.FULL_SET ) != 0 ) { // one final bin that may not have been dumped out
|
||||
output.println( thisAnnotationBin.value + "\t" + thisAnnotationBin.calcTiTv( AnnotationDatum.FULL_SET ) + "\t" + thisAnnotationBin.numVariants( AnnotationDatum.FULL_SET ) + "\tall");
|
||||
output.println( thisAnnotationBin.value + "\t" + thisAnnotationBin.calcTiTv( AnnotationDatum.NOVEL_SET ) + "\t" + thisAnnotationBin.numVariants( AnnotationDatum.NOVEL_SET ) + "\tnovel");
|
||||
output.println( thisAnnotationBin.value + "\t" + thisAnnotationBin.calcTiTv( AnnotationDatum.DBSNP_SET ) + "\t" + thisAnnotationBin.numVariants( AnnotationDatum.DBSNP_SET ) + "\tdbsnp");
|
||||
|
||||
numTi = 0;
|
||||
numTv = 0;
|
||||
lastDatum = null;
|
||||
for( AnnotationDatum datum : dataNovel.get( annotationKey ) ) {
|
||||
numTi += datum.numTransitions;
|
||||
numTv += datum.numTransversions;
|
||||
lastDatum = datum;
|
||||
if( numTi + numTv >= MAX_VARIANTS_PER_BIN/2 ) { // This annotation bin is full
|
||||
float titv;
|
||||
if( numTv == 0) { titv = 0.0f; }
|
||||
else { titv = ((float) numTi) / ((float) numTv); }
|
||||
output.println(datum.value + "\t" + titv + "\t" + (numTi+numTv) + "\t1");
|
||||
numTi = 0;
|
||||
numTv = 0;
|
||||
}
|
||||
// else, continue accumulating variants because this bin isn't full yet
|
||||
}
|
||||
if( numTi != 0 || numTv != 0 ) { // one final bin that may not have been dumped out
|
||||
float titv;
|
||||
if( numTv == 0) { titv = 0.0f; }
|
||||
else { titv = ((float) numTi) / ((float) numTv); }
|
||||
output.println(lastDatum.value + "\t" + titv + "\t" + (numTi+numTv)+ "\t1");
|
||||
}
|
||||
|
||||
numTi = 0;
|
||||
numTv = 0;
|
||||
lastDatum = null;
|
||||
for( AnnotationDatum datum : dataDBsnp.get( annotationKey ) ) {
|
||||
numTi += datum.numTransitions;
|
||||
numTv += datum.numTransversions;
|
||||
lastDatum = datum;
|
||||
if( numTi + numTv >= MAX_VARIANTS_PER_BIN ) { // This annotation bin is full
|
||||
float titv;
|
||||
if( numTv == 0) { titv = 0.0f; }
|
||||
else { titv = ((float) numTi) / ((float) numTv); }
|
||||
output.println(datum.value + "\t" + titv + "\t" + (numTi+numTv) + "\t2");
|
||||
numTi = 0;
|
||||
numTv = 0;
|
||||
}
|
||||
// else, continue accumulating variants because this bin isn't full yet
|
||||
}
|
||||
if( numTi != 0 || numTv != 0 ) { // one final bin that may not have been dumped out
|
||||
float titv;
|
||||
if( numTv == 0) { titv = 0.0f; }
|
||||
else { titv = ((float) numTi) / ((float) numTv); }
|
||||
output.println(lastDatum.value + "\t" + titv + "\t" + (numTi+numTv)+ "\t2");
|
||||
}
|
||||
|
||||
output.close();
|
||||
|
|
@ -239,7 +142,7 @@ public class AnnotationDataManager {
|
|||
try {
|
||||
final Process p = Runtime.getRuntime().exec(PATH_TO_RSCRIPT + " " + PATH_TO_RESOURCES + "plot_Annotations_BinnedTiTv.R" + " " +
|
||||
OUTPUT_PREFIX + annotationKey + ".dat" + " " + annotationKey + " " + MIN_VARIANTS_PER_BIN);
|
||||
} catch (Exception e) {
|
||||
} catch ( Exception e ) {
|
||||
throw new StingException( "Unable to execute RScript command" );
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -37,50 +37,129 @@ import java.util.Comparator;
|
|||
|
||||
public class AnnotationDatum implements Comparator<AnnotationDatum> {
|
||||
|
||||
public final float value;
|
||||
public int numTransitions;
|
||||
public int numTransversions;
|
||||
public float value;
|
||||
private final int[] ti;
|
||||
private final int[] tv;
|
||||
|
||||
public static final int FULL_SET = -1;
|
||||
public static final int NOVEL_SET = 0;
|
||||
public static final int DBSNP_SET = 1;
|
||||
public static final int TRUTH_SET = 2;
|
||||
|
||||
public AnnotationDatum() {
|
||||
|
||||
value = 0.0f;
|
||||
numTransitions = 0;
|
||||
numTransversions = 0;
|
||||
ti = new int[3];
|
||||
tv = new int[3];
|
||||
for( int iii = 0; iii < 3; iii++ ) {
|
||||
ti[iii] = 0;
|
||||
tv[iii] = 0;
|
||||
}
|
||||
}
|
||||
|
||||
public AnnotationDatum( float _value ) {
|
||||
|
||||
value = _value;
|
||||
numTransitions = 0;
|
||||
numTransversions = 0;
|
||||
}
|
||||
|
||||
final public void incrementTi() {
|
||||
numTransitions++;
|
||||
}
|
||||
|
||||
final public void incrementTv() {
|
||||
numTransversions++;
|
||||
}
|
||||
|
||||
final public float calcTiTv() {
|
||||
|
||||
if( numTransitions < 0 || numTransversions < 0) {
|
||||
throw new StingException( "Integer overflow detected! There are too many variants piled up in one annotation bin." );
|
||||
ti = new int[3];
|
||||
tv = new int[3];
|
||||
for( int iii = 0; iii < 3; iii++ ) {
|
||||
ti[iii] = 0;
|
||||
tv[iii] = 0;
|
||||
}
|
||||
}
|
||||
|
||||
if( numTransversions == 0) {
|
||||
final public void incrementTi( final boolean isNovelVariant, final boolean isTrueVariant ) {
|
||||
|
||||
if( isNovelVariant ) {
|
||||
ti[NOVEL_SET]++;
|
||||
} else { // Is known, in DBsnp
|
||||
ti[DBSNP_SET]++;
|
||||
}
|
||||
if( isTrueVariant ) {
|
||||
ti[TRUTH_SET]++;
|
||||
}
|
||||
}
|
||||
|
||||
final public void incrementTv( final boolean isNovelVariant, final boolean isTrueVariant ) {
|
||||
|
||||
if( isNovelVariant ) {
|
||||
tv[NOVEL_SET]++;
|
||||
} else { // Is known, in DBsnp
|
||||
tv[DBSNP_SET]++;
|
||||
}
|
||||
if( isTrueVariant ) {
|
||||
tv[TRUTH_SET]++;
|
||||
}
|
||||
}
|
||||
|
||||
final public void combine( final AnnotationDatum that ) {
|
||||
|
||||
for( int iii = 0; iii < 3; iii++ ) {
|
||||
this.ti[iii] += that.ti[iii];
|
||||
this.tv[iii] += that.tv[iii];
|
||||
}
|
||||
this.value = that.value; // overwrite this bin's value
|
||||
}
|
||||
|
||||
final public float calcTiTv( final int INDEX ) {
|
||||
|
||||
if( INDEX == FULL_SET ) {
|
||||
if( (ti[NOVEL_SET] + ti[DBSNP_SET]) < 0 || (tv[NOVEL_SET] + tv[DBSNP_SET]) < 0 ) {
|
||||
throw new StingException( "Integer overflow detected! There are too many variants piled up in one annotation bin." );
|
||||
}
|
||||
|
||||
if( (tv[NOVEL_SET] + tv[DBSNP_SET]) == 0 ) { // Don't divide by zero
|
||||
return 0.0f;
|
||||
}
|
||||
|
||||
return ((float) (ti[NOVEL_SET] + ti[DBSNP_SET])) /
|
||||
((float) (tv[NOVEL_SET] + tv[DBSNP_SET]));
|
||||
} else {
|
||||
if( ti[INDEX] < 0 || tv[INDEX] < 0 ) {
|
||||
throw new StingException( "Integer overflow detected! There are too many variants piled up in one annotation bin." );
|
||||
}
|
||||
|
||||
if( tv[INDEX] == 0 ) { // Don't divide by zero
|
||||
return 0.0f;
|
||||
}
|
||||
|
||||
return ((float) ti[INDEX]) / ((float) tv[INDEX]);
|
||||
}
|
||||
}
|
||||
|
||||
final public float calcTPrate() {
|
||||
|
||||
if( ti[NOVEL_SET] + tv[NOVEL_SET] + ti[DBSNP_SET] + tv[DBSNP_SET] == 0 ) { // Don't divide by zero
|
||||
return 0.0f;
|
||||
}
|
||||
|
||||
return ((float) numTransitions) / ((float) numTransversions);
|
||||
return ((float) ti[TRUTH_SET] + tv[TRUTH_SET]) /
|
||||
((float) ti[NOVEL_SET] + tv[NOVEL_SET] + ti[DBSNP_SET] + tv[DBSNP_SET]);
|
||||
}
|
||||
|
||||
public int compare( AnnotationDatum a1, AnnotationDatum a2 ) {
|
||||
final public int numVariants( final int INDEX ) {
|
||||
if( INDEX == FULL_SET ) {
|
||||
return ti[NOVEL_SET] + tv[NOVEL_SET] + ti[DBSNP_SET] + tv[DBSNP_SET];
|
||||
} else {
|
||||
return ti[INDEX] + tv[INDEX];
|
||||
}
|
||||
}
|
||||
|
||||
final public void clearBin() {
|
||||
value = 0.0f;
|
||||
for( int iii = 0; iii < 3; iii++ ) {
|
||||
ti[iii] = 0;
|
||||
tv[iii] = 0;
|
||||
}
|
||||
}
|
||||
|
||||
public int compare( AnnotationDatum a1, AnnotationDatum a2 ) { // Function needed for this to be a Comparator
|
||||
if( a1.value < a2.value ) { return -1; }
|
||||
if( a1.value > a2.value ) { return 1; }
|
||||
return 0;
|
||||
}
|
||||
|
||||
public int equals( AnnotationDatum that ) {
|
||||
public int equals( AnnotationDatum that ) { // Function needed for this to be sorted correctly in a TreeSet
|
||||
if( this.value < that.value ) { return -1; }
|
||||
if( this.value > that.value ) { return 1; }
|
||||
return 0;
|
||||
|
|
|
|||
Loading…
Reference in New Issue