now has two modes: one sample - just call indel sites; two samples - call somatic-looking variants only. Still uses heuristic count-based cutoffs, cutoffs are hardcoded and are pretty conservative...

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@967 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
asivache 2009-06-10 16:41:38 +00:00
parent 5451bbfd5a
commit 06e5a765f8
1 changed files with 220 additions and 19 deletions

View File

@ -1,25 +1,77 @@
package org.broadinstitute.sting.playground.gatk.walkers.indels;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import net.sf.samtools.Cigar;
import net.sf.samtools.CigarElement;
import net.sf.samtools.SAMFileReader;
import net.sf.samtools.SAMReadGroupRecord;
import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.gatk.walkers.ReadWalker;
import org.broadinstitute.sting.playground.utils.CircularArray;
import org.broadinstitute.sting.utils.StingException;
import org.broadinstitute.sting.utils.cmdLine.Argument;
public class IndelGenotyperWalker extends ReadWalker<Integer,Integer> {
@Argument(fullName="bed", shortName="bed", doc="BED output file name", required=true)
public java.io.File bed_file;
@Argument(fullName="somatic", shortName="somatic",
doc="Perform somatic calls; two input alignment files must be specified", required=false)
public boolean call_somatic = false;
private static int WINDOW_SIZE = 200;
private RunningCoverage coverage;
private RunningCoverage normal_coverage; // when performing somatic calls, we will be using this one for normal, and 'coverage' for tumor
private int currentContigIndex = -1;
private String refName = null;
private java.io.Writer output = null;
private Set<String> normal_samples = new HashSet<String>();
private Set<String> tumor_samples = new HashSet<String>();
@Override
public void initialize() {
coverage = new RunningCoverage(0,100);
coverage = new RunningCoverage(0,WINDOW_SIZE);
int nSams = getToolkit().getArguments().samFiles.size();
if ( call_somatic ) {
if ( nSams != 2 ) {
System.out.println("In --somatic mode two input bam files must be specified (normal/tumor)");
System.exit(1);
}
normal_coverage = new RunningCoverage(0,WINDOW_SIZE);
// this is an ugly hack: we want to be able to tell what file (tumor/normal sample) each read came from,
// but reads do not carry this information!
SAMFileReader rn = new SAMFileReader(getToolkit().getArguments().samFiles.get(0));
for ( SAMReadGroupRecord rec : rn.getFileHeader().getReadGroups() ) {
normal_samples.add(rec.getSample());
}
rn.close();
rn = new SAMFileReader(getToolkit().getArguments().samFiles.get(1));
for ( SAMReadGroupRecord rec : rn.getFileHeader().getReadGroups() ) {
tumor_samples.add(rec.getSample());
}
rn.close();
} else {
if ( nSams != 1 ) System.out.println("WARNING: multiple input files specified. \n"+
"WARNING: Without --somatic option they will be merged and processed as a single sample");
}
try {
output = new java.io.FileWriter(bed_file);
} catch (IOException e) {
throw new StingException("Failed to open file for writing BED output");
}
}
@Override
@ -30,13 +82,19 @@ public class IndelGenotyperWalker extends ReadWalker<Integer,Integer> {
read.getNotPrimaryAlignmentFlag() ||
read.getMappingQuality() == 0 ) return 0; // we do not need those reads!
if ( read.getReferenceIndex() != currentContigIndex ) {
if ( read.getReferenceIndex() != currentContigIndex ) {
// we just jumped onto a new contig
if ( read.getReferenceIndex() < currentContigIndex ) // paranoidal
throw new StingException("Read "+read.getReadName()+": contig is out of order");
if ( call_somatic) emit_somatic(1000000000); // print remaining indels from the previous contig (if any);
else emit(1000000000);
currentContigIndex = read.getReferenceIndex();
refName = new String(read.getReferenceName());
coverage.clear(); // reset coverage window; this will also set reference position to 0
if ( call_somatic) normal_coverage.clear();
}
if ( read.getAlignmentStart() < coverage.getStart() ) {
@ -46,30 +104,164 @@ public class IndelGenotyperWalker extends ReadWalker<Integer,Integer> {
// reads are sorted; we are not going to see any more coverage or new indels prior
// to current read's start position!
for ( long pos = coverage.getStart() ; pos < Math.min(read.getAlignmentStart(),coverage.getStop()+1) ; pos++ ) {
List<IndelVariant> variants = coverage.indelsAt(pos);
if ( variants.size() == 0 ) continue;
System.out.print(read.getReferenceName()+"\t"+pos+"\t"+coverage.coverageAt(pos));
for ( IndelVariant var : variants ) {
System.out.print("\t"+var.getType()+"\t"+var.getBases()+"\t"+var.getCount());
}
System.out.println();
}
coverage.shift((int)(read.getAlignmentStart() - coverage.getStart() ) );
if ( call_somatic ) emit_somatic( read.getAlignmentStart() );
else emit( read.getAlignmentStart() );
if ( read.getAlignmentEnd() > coverage.getStop()) {
// should never happen
throw new StingException("Read "+read.getReadName()+": out of coverage window bounds");
throw new StingException("Read "+read.getReadName()+": out of coverage window bounds.\n"+
"Read length="+read.getReadLength()+"; cigar="+read.getCigarString()+"; start="+
read.getAlignmentStart()+"; end="+read.getAlignmentEnd()+"; window start="+coverage.getStart()+
"; window end="+coverage.getStop());
}
coverage.add(read,ref);
if ( call_somatic ) {
String rg = (String)read.getAttribute("RG");
String sample = read.getHeader().getReadGroup(rg).getSample();
if ( normal_samples.contains(sample) ) {
normal_coverage.add(read,ref);
} else if ( tumor_samples.contains(sample) ) {
coverage.add(read,ref);
} else {
throw new StingException("Unrecognized sample: "+sample);
}
} else {
coverage.add(read, ref);
}
return 1;
}
/** Output indel calls up to the specified position and shift the coverage array(s): after this method is executed
* first elements of the coverage arrays map onto 'position'
*
* @param position
*/
private void emit(long position) {
for ( long pos = coverage.getStart() ; pos < Math.min(position,coverage.getStop()+1) ; pos++ ) {
List<IndelVariant> variants = coverage.indelsAt(pos);
if ( variants.size() == 0 ) continue; // no indels
int cov = coverage.coverageAt(pos);
if ( cov < 6 ) continue; // low coverage
int total_variant_count = 0;
int max_variant_count = 0;
String indelString = null;
int event_length = 0; // length of the event on the reference
for ( IndelVariant var : variants ) {
int cnt = var.getCount();
total_variant_count +=cnt;
if ( cnt > max_variant_count ) {
max_variant_count = cnt;
indelString = var.getBases();
event_length = var.lengthOnRef();
}
}
if ( (double)total_variant_count > 0.3*cov && (double) max_variant_count > 0.7*total_variant_count ) {
try {
output.write(refName+"\t"+(pos-1)+"\t"+(event_length > 0 ? pos-1+event_length : pos-1)+
"\t"+(event_length>0? "-":"+")+indelString +":"+total_variant_count+"/"+cov+"\n");
} catch (IOException e) {
System.out.println(e.getMessage());
e.printStackTrace();
throw new StingException("Error encountered while writing into output BED file");
}
}
// for ( IndelVariant var : variants ) {
// System.out.print("\t"+var.getType()+"\t"+var.getBases()+"\t"+var.getCount());
// }
}
coverage.shift((int)(position - coverage.getStart() ) );
}
/** Output somatic indel calls up to the specified position and shift the coverage array(s): after this method is executed
* first elements of the coverage arrays map onto 'position'
*
* @param position
*/
private void emit_somatic(long position) {
for ( long pos = coverage.getStart() ; pos < Math.min(position,coverage.getStop()+1) ; pos++ ) {
List<IndelVariant> tumor_variants = coverage.indelsAt(pos);
List<IndelVariant> normal_variants = normal_coverage.indelsAt(pos);
if ( tumor_variants.size() == 0 ) continue; // no indels in tumor
int tumor_cov = coverage.coverageAt(pos);
int normal_cov = normal_coverage.coverageAt(pos);
if ( tumor_cov < 6 ) continue; // low coverage
if ( normal_cov < 4 ) continue; // low coverage
int total_variant_count_tumor = 0;
int max_variant_count_tumor = 0;
String indelStringTumor = null;
int event_length_tumor = 0; // length of the event on the reference
for ( IndelVariant var : tumor_variants ) {
int cnt = var.getCount();
total_variant_count_tumor +=cnt;
if ( cnt > max_variant_count_tumor ) {
max_variant_count_tumor = cnt;
indelStringTumor = var.getBases();
event_length_tumor = var.lengthOnRef();
}
}
if ( (double)total_variant_count_tumor > 0.3*tumor_cov && (double) max_variant_count_tumor > 0.7*total_variant_count_tumor ) {
System.out.println(refName+"\t"+(pos-1)+"\t"+(event_length_tumor > 0 ? pos-1+event_length_tumor : pos-1)+
"\t"+(event_length_tumor >0? "-":"+")+indelStringTumor +":"+total_variant_count_tumor+"/"+tumor_cov);
if ( normal_variants.size() == 0 ) {
try {
output.write(refName+"\t"+(pos-1)+"\t"+(event_length_tumor > 0 ? pos-1+event_length_tumor : pos-1)+
"\t"+(event_length_tumor >0? "-":"+")+indelStringTumor +":"+total_variant_count_tumor+"/"+tumor_cov+"\n");
} catch (IOException e) {
System.out.println(e.getMessage());
e.printStackTrace();
throw new StingException("Error encountered while writing into output BED file");
}
System.out.println("INDICATION FOR SOMATIC\n---------------------------\n");
} else {
System.out.println("INDICATION FOR GERMLINE\n---------------------------\n");
}
}
// for ( IndelVariant var : variants ) {
// System.out.print("\t"+var.getType()+"\t"+var.getBases()+"\t"+var.getCount());
// }
}
coverage.shift((int)(position - coverage.getStart() ) );
normal_coverage.shift((int)(position - normal_coverage.getStart() ) );
}
@Override
public void onTraversalDone(Integer result) {
emit(1000000000); // emit everything we might have left
try {
output.close();
} catch (IOException e) {
System.out.println("Failed to close output BED file gracefully, data may be lost");
e.printStackTrace();
}
super.onTraversalDone(result);
}
@Override
public Integer reduce(Integer value, Integer sum) {
sum += value;
@ -99,6 +291,15 @@ public class IndelGenotyperWalker extends ReadWalker<Integer,Integer> {
count += i;
}
/** Returns length of the event on the reference (number of deleted bases
* for deletions, -1 for insertions.
* @return
*/
public int lengthOnRef() {
if ( type == Type.D ) return bases.length();
else return -1;
}
public void increment() { count+=1; }
public int getCount() { return count; }
@ -235,8 +436,8 @@ public class IndelGenotyperWalker extends ReadWalker<Integer,Integer> {
if ( type == null ) continue; // element was not an indel, go grab next element...
// we got an indel if we are here...
if ( i == 0 ) System.out.println("WARNING: Indel at the start of the read "+r.getReadName());
if ( i == nCigarElems - 1) System.out.println("WARNING: Indel at the end of the read "+r.getReadName());
if ( i == 0 ) logger.warn("Indel at the start of the read "+r.getReadName());
if ( i == nCigarElems - 1) logger.warn("Indel at the end of the read "+r.getReadName());
updateCount(indelPosition, type, bases);
}