a few tweaks to make it more robust: ignore reads with cigars containing anything but I,D,M; don't set up contig ordering manually, rely upon reference sequence and its dictionary; don't die if a record does not have NM tag, but faal back to direct counting instead; now requires reference as a cmdline arg

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@378 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
asivache 2009-04-13 04:49:19 +00:00
parent 32e000bbfe
commit b4136b6d6e
5 changed files with 268 additions and 212 deletions

View File

@ -127,7 +127,7 @@ public class rodSAMPileup extends ReferenceOrderedDatum implements AllelicVarian
System.out.printf(" Exception caught during parsing Pileup line: %s%n", Utils.join(" <=> ", parts)); System.out.printf(" Exception caught during parsing Pileup line: %s%n", Utils.join(" <=> ", parts));
throw e; throw e;
} }
if ( nNonref > 1 ) System.out.println("SAM pileup: WARNING: multi-allelic variant at "+ loc.toString()); if ( nNonref > 1 ) System.out.println("SAM pileup: WARNING: multi-allelic variant : ("+refBaseChar+") -->"+toMediumString());
} }
@ -243,10 +243,13 @@ public class rodSAMPileup extends ReferenceOrderedDatum implements AllelicVarian
} }
public String toMediumString() { public String toMediumString() {
String s = String.format("%s:%s:%s", getLocation().toString(), name, observedString);
String s = null;
if ( refBaseChar == '*' ) s = String.format("%s:%s:%s", getLocation().toString(), name,observedString);
else s = String.format("%s:%s:%s/%s", getLocation().toString(), name, observedAlleles.get(0),observedAlleles.get(1));
if ( isSNP() ) s += ": SNP"; if ( isSNP() ) s += ": SNP";
else { else {
if ( isInsertion() ) s += ": Insertoin"; if ( isInsertion() ) s += ": Insertion";
else { else {
if ( isDeletion() ) s+= ": Deletion"; if ( isDeletion() ) s+= ": Deletion";
else { else {

View File

@ -31,7 +31,13 @@ public class MendelianInheritanceWalker extends RefWalker<TrioConcordanceRecord
AllelicVariant dad = (rodSAMPileup)rodData.lookup("father", null); AllelicVariant dad = (rodSAMPileup)rodData.lookup("father", null);
AllelicVariant kid = (rodSAMPileup)rodData.lookup("daughter", null); AllelicVariant kid = (rodSAMPileup)rodData.lookup("daughter", null);
if ( ! hasCall(mom) || ! hasCall(dad) || ! hasCall(kid) ) return t; // at least one person is not called; nothing to do, bail out if ( hasCall(mom)) t.mom_assessed = 1;
if ( hasCall(dad)) t.dad_assessed = 1;
if ( hasCall(kid)) t.kid_assessed = 1;
if ( hasCall(mom) && mom.isIndel() ) System.out.println("GOT INDEL: "+mom.toString());
if (( t.mom_assessed + t.dad_assessed + t.kid_assessed) != 3 ) return t; // at least one person is not called; nothing to do, bail out
// proceed only if we got confident calls // proceed only if we got confident calls
@ -62,11 +68,11 @@ public class MendelianInheritanceWalker extends RefWalker<TrioConcordanceRecord
mom_alleles.contains(kid_allele_2) && dad_alleles.contains(kid_allele_1) ) { mom_alleles.contains(kid_allele_2) && dad_alleles.contains(kid_allele_1) ) {
t.consistent_snp = 1; t.consistent_snp = 1;
logger.info("consistent SNP at "+context.getLocation() + // logger.info("consistent SNP at "+context.getLocation() +
"("+ref+") " + mom_alleles.get(0)+"/" +mom_alleles.get(1) + " " + // "("+ref+") " + mom_alleles.get(0)+"/" +mom_alleles.get(1) + " " +
dad_alleles.get(0)+"/" +dad_alleles.get(1) + " " + // dad_alleles.get(0)+"/" +dad_alleles.get(1) + " " +
kid_allele_1+"/" +kid_allele_2 // kid_allele_1+"/" +kid_allele_2
); // );
return t; return t;
} }

View File

@ -27,9 +27,9 @@ public class IndelInspector extends CommandLineProgram {
@Option(doc="Output file (sam or bam) for non-indel related reads and indel reads that were not improved") public String OUT1; @Option(doc="Output file (sam or bam) for non-indel related reads and indel reads that were not improved") public String OUT1;
@Option(doc="Output file (sam or bam) for improved (realigned) indel related reads") public String OUT2; @Option(doc="Output file (sam or bam) for improved (realigned) indel related reads") public String OUT2;
@Option(doc="[paranoid] If true, all reads that would be otherwise picked and processed by this tool will be saved, unmodified, into OUT1", optional=true) public boolean CONTROL_RUN; @Option(doc="[paranoid] If true, all reads that would be otherwise picked and processed by this tool will be saved, unmodified, into OUT1", optional=true) public boolean CONTROL_RUN;
@Option(doc="Error counting mode: MM - count mismatches only, ERR - count errors (arachne style), MG - count mismatches and gaps as one error each") public String ERR_MODE; @Option(doc="Error counting mode: MM - mismatches only (from sam tags), MC - mismatches only doing actual mismatch count on the fly (use this if tags are incorrectly set); ERR - errors (arachne style: mm+gap lengths), MG - count mismatches and gaps as one error each") public String ERR_MODE;
@Option(doc="Maximum number of errors allowed (see ERR_MODE)") public Integer MAX_ERRS; @Option(doc="Maximum number of errors allowed (see ERR_MODE)") public Integer MAX_ERRS;
// @Option(shortName="R", doc="Reference fasta or fasta.gz file") public File REF_FILE; @Option(shortName="R", doc="Reference fasta or fasta.gz file") public File REF_FILE;
/** Required main method implementation. */ /** Required main method implementation. */
public static void main(final String[] argv) { public static void main(final String[] argv) {
@ -38,25 +38,31 @@ public class IndelInspector extends CommandLineProgram {
protected int doWork() { protected int doWork() {
System.out.println("I am at version 0.3"); int discarded_count = 0;
ReferenceSequenceFileWalker reference = new ReferenceSequenceFileWalker(
REF_FILE
);
if ( reference.getSequenceDictionary() == null ) {
System.out.println("No reference sequence dictionary found. Abort.");
}
GenomeLoc.setupRefContigOrdering(reference.getSequenceDictionary());
GenomeLoc location = null; GenomeLoc location = null;
if ( GENOME_LOCATION != null ) { if ( GENOME_LOCATION != null ) {
location = GenomeLoc.parseGenomeLoc(GENOME_LOCATION); location = GenomeLoc.parseGenomeLoc(GENOME_LOCATION);
} }
if ( ! ERR_MODE.equals("MM") && ! ERR_MODE.equals("MG") && ! ERR_MODE.equals("ERR") ) { if ( ! ERR_MODE.equals("MM") && ! ERR_MODE.equals("MG") && ! ERR_MODE.equals("ERR") && ! ERR_MODE.equals("MC") ) {
System.out.println("Unknown value specified for ERR_MODE"); System.out.println("Unknown value specified for ERR_MODE: "+ERR_MODE);
return 1; return 1;
} }
final SAMFileReader samReader = new SAMFileReader(getInputFile(INPUT_FILE,"/broad/1KG/")); final SAMFileReader samReader = new SAMFileReader(getInputFile(INPUT_FILE,"/broad/1KG/"));
samReader.setValidationStringency(SAMFileReader.ValidationStringency.SILENT); samReader.setValidationStringency(SAMFileReader.ValidationStringency.SILENT);
setContigOrdering(samReader); // setContigOrdering(samReader);
ReferenceSequenceFileWalker reference = new ReferenceSequenceFileWalker(
new File("/seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta")
);
ReferenceSequence contig_seq = null; ReferenceSequence contig_seq = null;
@ -105,7 +111,7 @@ public class IndelInspector extends CommandLineProgram {
} }
} }
// if contig is specified and wqe did not reach it yet, skip the records until we reach that contig: // if contig is specified and we did not reach it yet, skip the records until we reach that contig:
if ( location != null && GenomeLoc.compareContigs(cur_contig, location.getContig()) == -1 ) continue; if ( location != null && GenomeLoc.compareContigs(cur_contig, location.getContig()) == -1 ) continue;
@ -114,7 +120,22 @@ public class IndelInspector extends CommandLineProgram {
// if stop position is specified and we are past that, stop reading: // if stop position is specified and we are past that, stop reading:
if ( location != null && r.getAlignmentStart() > location.getStop() ) break; if ( location != null && r.getAlignmentStart() > location.getStop() ) break;
if ( cur_contig.equals("chrM") || GenomeLoc.compareContigs(cur_contig,"chrY")==1 ) continue; // skip chrM and unplaced contigs for now // if ( cur_contig.equals("chrM") || GenomeLoc.compareContigs(cur_contig,"chrY") > 0 ) continue; // skip chrM and unplaced contigs for now
// we currently do not know how to deal with cigars containing elements other than M,I,D, so
// let's just skip the reads that contain those other elements (clipped reads?)
Cigar c = r.getCigar();
boolean cigar_acceptable = true;
for ( int z = 0 ; z < c.numCigarElements() ; z++ ) {
CigarElement ce = c.getCigarElement(z);
switch ( ce.getOperator() ) {
case M:
case I:
case D: break;
default: cigar_acceptable = false;
}
}
if ( ! cigar_acceptable ) continue;
int err = -1; int err = -1;
/* /*
@ -130,12 +151,13 @@ public class IndelInspector extends CommandLineProgram {
continue; continue;
*/ */
if ( ERR_MODE.equals("MM")) err = numMismatches(r); if ( ERR_MODE.equals("MM")) err = numMismatches(r,contig_seq);
else if ( ERR_MODE.equals("ERR")) err = numErrors(r); else if ( ERR_MODE.equals("MC") ) err = AlignmentUtils.numMismatches(r,contig_seq);
else if ( ERR_MODE.equals("MG")) err = numMismatchesGaps(r); else if ( ERR_MODE.equals("ERR")) err = numErrors(r,contig_seq);
else if ( ERR_MODE.equals("MG")) err = numMismatchesGaps(r,contig_seq);
if ( err > MAX_ERRS.intValue() ) continue; if ( err > MAX_ERRS.intValue() ) continue;
// counter++; // counter++;
// if ( counter % 1000000 == 0 ) System.out.println(counter+" records; "+col.memStatsString()); // if ( counter % 1000000 == 0 ) System.out.println(counter+" records; "+col.memStatsString());
col.receive(r); col.receive(r);
} }
@ -157,10 +179,10 @@ public class IndelInspector extends CommandLineProgram {
* @return number of errors (number of mismatches plus total length of all insertions/deletions * @return number of errors (number of mismatches plus total length of all insertions/deletions
* @throws RuntimeException * @throws RuntimeException
*/ */
private static int numErrors(SAMRecord r) throws RuntimeException { private static int numErrors(SAMRecord r, ReferenceSequence refseq) throws RuntimeException {
// NM currently stores the total number of mismatches in all blocks + 1 // NM currently stores the total number of mismatches in all blocks + 1
int errs = numMismatches(r); int errs = numMismatches(r,refseq);
// now we have to add the total length of all indels: // now we have to add the total length of all indels:
Cigar c = r.getCigar(); Cigar c = r.getCigar();
@ -185,10 +207,10 @@ public class IndelInspector extends CommandLineProgram {
* deletion will be counted as a single error regardless of the length) * deletion will be counted as a single error regardless of the length)
* @throws RuntimeException * @throws RuntimeException
*/ */
private static int numMismatchesGaps(SAMRecord r) throws RuntimeException { private static int numMismatchesGaps(SAMRecord r,ReferenceSequence refseq) throws RuntimeException {
// NM currently stores the total number of mismatches in all blocks + 1 // NM currently stores the total number of mismatches in all blocks + 1
int errs = numMismatches(r); int errs = numMismatches(r,refseq);
// now we have to add the total length of all indels: // now we have to add the total length of all indels:
Cigar c = r.getCigar(); Cigar c = r.getCigar();
@ -207,9 +229,11 @@ public class IndelInspector extends CommandLineProgram {
} }
/** This method is a HACK: it is designed to work around the current bug in NM tags created at CRD */ /** This method is a HACK: it is designed to work around the current bug in NM tags created at CRD */
private static int numMismatches(SAMRecord r) throws RuntimeException { private static int numMismatches(SAMRecord r, ReferenceSequence refseq) throws RuntimeException {
// NM currently stores the total number of mismatches in all blocks + 1 // NM currently stores the total number of mismatches in all blocks + 1
Integer i = (Integer)r.getAttribute("NM");
if ( i == null ) return AlignmentUtils.numMismatches(r,refseq);
return ((Integer)r.getAttribute("NM")).intValue() - 1; return ((Integer)r.getAttribute("NM")).intValue() - 1;
} }
@ -265,7 +289,7 @@ public class IndelInspector extends CommandLineProgram {
private void setDefaultContigOrdering() { private void setDefaultContigOrdering() {
Map<String,Integer> rco = new HashMap<String,Integer>(); Map<String,Integer> rco = new HashMap<String,Integer>();
rco.put("chrM",0); rco.put("chrM",0);
for ( int i = 1 ; i <= 22 ; i++ ) rco.put("chr"+i,i); for ( int i = 1 ; i <= 22 ; i++ ) rco.put(Integer.toString(i),i);//rco.put("chr"+i,i);
rco.put("chrX",23); rco.put("chrX",23);
rco.put("chrY",24); rco.put("chrY",24);
} }

View File

@ -230,7 +230,7 @@ public class IndelRecordPileCollector implements RecordReceiver {
// does nothing if alignment has no indels, otherwise adds the indels to the list and (re)sets state to 'active' // does nothing if alignment has no indels, otherwise adds the indels to the list and (re)sets state to 'active'
extractIndelsAndUpdateState(r.getCigar(),currPos); extractIndelsAndUpdateState(r.getCigar(),currPos);
if ( ! avoiding_region && mAllIndels.size() > 20 ) avoiding_region = true; if ( mState == ACTIVE_STATE && ( ! avoiding_region ) && ( mAllIndels.size() > 20 || mRecordPile.size() > 1000 ) ) avoiding_region = true;
if ( ! avoiding_region ) mRecordPile.add(r); // add new record if this is not some crazy region if ( ! avoiding_region ) mRecordPile.add(r); // add new record if this is not some crazy region
@ -254,9 +254,11 @@ public class IndelRecordPileCollector implements RecordReceiver {
long start = mAllIndels.first().getObject().getStart(); long start = mAllIndels.first().getObject().getStart();
long stop = mAllIndels.last().getObject().getStop(); long stop = mAllIndels.last().getObject().getStop();
System.out.println("Genomic region "+mLastContig+":"+ start + "-"+ stop + System.out.println("Genomic region "+mLastContig+":"+ start + "-"+ stop +
" was ignored: "+mAllIndels.size() +" unique indels with average distance of "+ " was ignored: ");
System.out.println(" "+mAllIndels.size() +" unique indels with average distance of "+
((double)(stop - start))/((double)mAllIndels.size()-1) + ((double)(stop - start))/((double)mAllIndels.size()-1) +
" bases between indels"); " bases between indels");
System.out.println(" "+mRecordPile.size() +" reads collected before aborting");
} }
// no indels or avoiding indels in bad region: send all records to defaultReceiver and clear the pile // no indels or avoiding indels in bad region: send all records to defaultReceiver and clear the pile

View File

@ -19,6 +19,15 @@ public class TrioConcordanceRecord {
public int inconsistent_indels_in_parent = 0 ; public int inconsistent_indels_in_parent = 0 ;
public int inconsistent_indels_in_kid = 0 ; public int inconsistent_indels_in_kid = 0 ;
public int non_biallelic = 0; // number of variant calls that are not biallelic public int non_biallelic = 0; // number of variant calls that are not biallelic
public long mom_assessed = 0; // number of assessed loci for mother (i.e. passing confidence threshold filter)
public long dad_assessed = 0;
public long kid_assessed = 0;
public long mom_ref = 0; // number of reference calls (out of total assessed)
public long dad_ref = 0;
public long kid_ref = 0;
public long mom_snp = 0; // number of snp calls (out of total assessed)
public long dad_snp = 0;
public long kid_snp = 0;
public TrioConcordanceRecord add(TrioConcordanceRecord other) { public TrioConcordanceRecord add(TrioConcordanceRecord other) {
this.assessed_loci += other.assessed_loci; this.assessed_loci += other.assessed_loci;
@ -32,6 +41,15 @@ public class TrioConcordanceRecord {
this.inconsistent_indels_in_parent += other.inconsistent_indels_in_parent; this.inconsistent_indels_in_parent += other.inconsistent_indels_in_parent;
this.inconsistent_indels_in_kid += other.inconsistent_indels_in_kid; this.inconsistent_indels_in_kid += other.inconsistent_indels_in_kid;
this.non_biallelic += other.non_biallelic; this.non_biallelic += other.non_biallelic;
this.mom_assessed += other.mom_assessed;
this.dad_assessed += other.dad_assessed;
this.kid_assessed += other.kid_assessed;
this.mom_ref += other.mom_ref;
this.dad_ref += other.dad_ref;
this.kid_ref += other.kid_ref;
this.mom_snp += other.mom_snp;
this.dad_snp += other.dad_snp;
this.kid_snp += other.kid_snp;
return this; return this;
} }
@ -39,8 +57,11 @@ public class TrioConcordanceRecord {
public int totalSNP() { return consistent_snp + inconsistent_snp + non_biallelic; } public int totalSNP() { return consistent_snp + inconsistent_snp + non_biallelic; }
public String toString() { public String toString() {
return String.format("assessed: %d; reference: %d (%3.2f); total snp: %d; consistent snp: %d (%3.2f); multiallelic: %d (%3.2f); " , return String.format("%ntotal assessed in trio: %d%n reference: %d (%3.2f)%n total snp: %d%n consistent snp: %d (%3.2f)%n multiallelic: %d (%3.2f)%nper trio individual:%n assessed:%n mother: %d%n father: %d%n daughter: %d%n" ,
assessed_loci, consistent_ref, ((double)consistent_ref*100.00)/assessed_loci,totalSNP(), consistent_snp, ((double)consistent_snp*100.0)/totalSNP(), assessed_loci, consistent_ref, ((double)consistent_ref*100.00)/assessed_loci,totalSNP(), consistent_snp, ((double)consistent_snp*100.0)/totalSNP(),
non_biallelic, ((double)non_biallelic*100.0)/totalSNP()); non_biallelic, ((double)non_biallelic*100.0)/totalSNP(),mom_assessed,dad_assessed,kid_assessed);
// return String.format("total assessed in trio: %d%n reference: %d (%3.2f)%n total snp: %d%n consistent snp: %d (%3.2f)%n multiallelic: %d (%3.2f)" ,
// assessed_loci, consistent_ref, ((double)consistent_ref*100.00)/assessed_loci,totalSNP(), consistent_snp, ((double)consistent_snp*100.0)/totalSNP(),
// non_biallelic, ((double)non_biallelic*100.0)/totalSNP());
} }
} }