Inconsequential changes, more 'variant classification' values are recognized

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5236 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
asivache 2011-02-14 17:36:39 +00:00
parent d3660aa00e
commit 7f7d7eb2d1
3 changed files with 272 additions and 93 deletions

View File

@ -337,7 +337,45 @@ public class VariantContextAdaptors {
addGenotype(genotypes,tumorSample,maf.getObservedTumorAlleleList(),maf.getRefBases());
HashMap<String, Object> attrs = new HashMap<String, Object>(1);
HashMap<String, Object> attrs = new HashMap<String, Object>(10);
// fill attributes:
if ( maf.getHugoGeneSymbol() != null && ! maf.getHugoGeneSymbol().equals("Unknown"))
attrs.put("Gene",maf.getHugoGeneSymbol());
if ( maf.isSomatic() ) {
attrs.put(VCFConstants.SOMATIC_KEY,true);
attrs.put("SS","Somatic");
} else {
attrs.put("SS","Germline");
}
// Translate the MAF variant classification (when one was read) into the "VC" attribute.
// Every case must end with 'break': without it the last cases fall through and
// the value is overwritten by the one belonging to the case below.
if ( maf.getVariantClassification() != null ) {
    switch(maf.getVariantClassification()) {
        case Intergenic: attrs.put("VC","Genomic"); break;
        case Intron: attrs.put("VC","Intron"); break;
        case Noncoding_transcript: attrs.put("VC","Noncoding_transcript"); break;
        case UTR3: attrs.put("VC","3'UTR"); break;
        case UTR5: attrs.put("VC","5'UTR"); break;
        // Flank5 and Promoter are both reported with the same "5'flank" value
        case Flank5: attrs.put("VC","5'flank"); break;
        case Promoter: attrs.put("VC","5'flank"); break;
        case De_novo_start: attrs.put("VC","De_novo_start"); break;
        case Silent: attrs.put("VC","Silent"); break;
        case Missense: attrs.put("VC","Missense"); break;
        case Nonsense: attrs.put("VC","Nonsense"); break;
        case Splice: attrs.put("VC","Splice_site"); break;
        case miRNA: attrs.put("VC","miRNA"); break;
        case Frameshift: attrs.put("VC","Frameshift"); break;
        case Inframe: attrs.put("VC","Inframe"); break;
        case Stop_deletion: attrs.put("VC","Stop_codon_deletion"); break;
        case Splice_site_deletion: attrs.put("VC","Splice_site_deletion"); break;
        case Splice_site_insertion: attrs.put("VC","Splice_site_insertion"); break;
        case Unclassified: attrs.put("VC","Unclassified"); break;
        default: break; // any other value: leave "VC" unset
    }
}
attrs.put("VT",maf.getType());
// attrs.put(VariantContext.ID_KEY, hapmap.getName());
int end = maf.getEnd();
VariantContext vc = new VariantContext(name, maf.getChr(), maf.getStart(), end, alleles,

View File

@ -27,13 +27,18 @@ package org.broadinstitute.sting.playground.gatk.features.maf;
import org.broad.tribble.FeatureCodec;
import org.broad.tribble.Feature;
import org.broad.tribble.TribbleException;
import org.broad.tribble.readers.LineReader;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.exceptions.StingException;
import java.io.IOException;
import java.util.Map;
import java.util.HashMap;
import java.util.List;
import java.util.ArrayList;
import java.lang.reflect.Field;
/**
* Created by IntelliJ IDEA.
@ -47,31 +52,21 @@ public class MafCodec implements FeatureCodec {
private int expectedTokenCount = -1;

// Header names of the columns that every MAF file must provide.
private static final String BUILD_COLNAME="NCBI_Build";
private static final String CHR_COLNAME="Chromosome";
private static final String START_COLNAME="Start_position";
private static final String END_COLNAME="End_position";
private static final String REF_ALLELE_COLNAME="Reference_Allele";
private static final String TUMOR_ALLELE1_COLNAME="Tumor_Seq_Allele1";
private static final String TUMOR_ALLELE2_COLNAME="Tumor_Seq_Allele2";
private static final String TUMOR_SAMPLE_COLNAME="Tumor_Sample_Barcode";
private static final String NORMAL_SAMPLE_COLNAME="Matched_Norm_Sample_Barcode";
// Header names of the optional columns (absent from maf lite).
private static final String VARTYPE_COLNAME="Variant_Type";
private static final String STRAND_COLNAME="Strand";
private static final String HUGO_GENE_COLNAME="Hugo_Symbol";
private static final String VARCLASS_COLNAME="Variant_Classification";

// Required columns. Each Column starts with index -1 (unset) and is declared exactly once:
// a second declaration of the same name with a different type does not compile.
private Column BUILD_COL = new Column(BUILD_COLNAME,true);
private Column CHR_COL = new Column(CHR_COLNAME,true);
private Column START_COL = new Column(START_COLNAME,true);
private Column END_COL = new Column(END_COLNAME,true);
private Column REF_ALLELE_COL = new Column(REF_ALLELE_COLNAME,true);
private Column TUMOR_ALLELE1_COL = new Column(TUMOR_ALLELE1_COLNAME,true);
private Column TUMOR_ALLELE2_COL = new Column(TUMOR_ALLELE2_COLNAME,true);
private Column TUMOR_SAMPLE_COL = new Column(TUMOR_SAMPLE_COLNAME,true);
private Column NORMAL_SAMPLE_COL = new Column(NORMAL_SAMPLE_COLNAME,true);
// optional fields (absent from maf lite):
private Column VARTYPE_COL = new Column(VARTYPE_COLNAME,false);
private Column STRAND_COL = new Column(STRAND_COLNAME,false);
private Column HUGO_GENE_COL = new Column(HUGO_GENE_COLNAME,false);
private Column VARCLASS_COL = new Column(VARCLASS_COLNAME,false);
public enum MAF_TYPE {
@ -85,18 +80,55 @@ public class MafCodec implements FeatureCodec {
// Type of the MAF input (a MAF_TYPE value); starts out as UNKNOWN.
private MAF_TYPE mafType=MAF_TYPE.UNKNOWN;
// Every Column-typed field of this codec, collected once by the constructor.
private List<Column> allColumns = null; /// filled dynamically by constructor through introspection. Slow but less typing.
// Set after the "more columns than expected" warning has been logged, so it is logged only once.
private boolean tooManyColsWarned = false;
// Set after the "too few columns" error has been logged, so it is logged only once.
private boolean tooFewColsWarned = false;
/**
 * Creates the codec and collects, through reflection, every field declared on this
 * object's class whose current value is a {@link Column}; the result is stored in
 * <code>allColumns</code>.
 *
 * @throws StingException if a declared field cannot be read reflectively
 */
public MafCodec() {
    allColumns = new ArrayList<Column>(30);
    try {
        for ( Field declared : this.getClass().getDeclaredFields() ) {
            // read the field once and keep it only if it holds a Column
            Object value = declared.get(this);
            if ( ! (value instanceof Column) ) continue;
            allColumns.add((Column)value);
        }
    } catch (IllegalAccessException e) {
        throw new StingException("Error in MAFCodec when trying to introspect itself, this is probably a BUG",e);
    }
}
/**
 * Decode a line to obtain just its FeatureLoc for indexing -- contig, start, and stop.
 *
 * This method will NOT fill in the additional information available in the maf file:
 * it delegates to {@link #reallyDecode(String, boolean)} with <code>extra</code> set to false.
 * @param line the input line to decode
 * @return the FeatureLoc encoded by the line, or null if the line does not represent a feature (e.g. is
 * a comment)
 */
public Feature decodeLoc(String line) {
    // Location-only decoding. A full decode(line) here would contradict the contract above,
    // and a second return statement after it would be unreachable.
    return reallyDecode(line,false);
}
/**
 * Fully decodes a maf line, extracting as much additional/annotation information as is available.
 * Delegates to {@link #reallyDecode(String, boolean)} with <code>extra</code> set to true.
 * @param line the input line to decode
 * @return the feature encoded by the line, or null if the line does not represent a feature (e.g. is
 * a comment)
 */
public Feature decode(String line) {
    final boolean readAnnotations = true;
    return reallyDecode(line, readAnnotations);
}
/** Decodes a maf line. If <code>extra</code> is false, will decode only location and return;
* if <code>extra</code> is true, then extracts everything it can (samples, annotations, etc)
* @param line
* @param extra
* @return
*/
public Feature reallyDecode(String line, boolean extra) {
// ignore commented-out lines
if (line.startsWith("#")) return null;
@ -137,24 +169,39 @@ public class MafCodec implements FeatureCodec {
if (tokens.length < expectedTokenCount) {
log.error("MAF line contains too few columns ("+tokens.length+")");
return null;
if ( ! tooFewColsWarned ) {
log.error("MAF line contains too few columns ("+tokens.length+"); this error is reported only once.");
tooFewColsWarned = true;
}
}
if (tokens.length > expectedTokenCount) {
log.warn("MAF line contains more columns than expected ("+tokens.length+"); extra columns discarded");
if ( ! tooManyColsWarned ) {
log.warn("MAF line contains more columns than expected ("+tokens.length+"); extra columns discarded. This error is shown only once.");
tooManyColsWarned = true;
}
}
if ( tokens[CHR_COL].equals("Chromosome") ) return null; // if someone uses this codec manually and feeds it the header line multiple times...
if ( tokens[CHR_COL.getIndex()].equals("Chromosome") ) return null; // if someone uses this codec manually and feeds it the header line multiple times...
// create a new feature from the line:
int start = Integer.valueOf(tokens[START_COL]);
int stop = Integer.valueOf(tokens[END_COL]);
int start = 0;
try {
start = Integer.parseInt(START_COL.getValue(tokens));
} catch (NumberFormatException e) {
throw new UserException.MalformedFile("Missing or non-numeric start position in line:\n"+line,e);
}
int stop = 0 ;
try {
stop = Integer.parseInt(END_COL.getValue(tokens));
} catch (NumberFormatException e) {
throw new UserException.MalformedFile("Missing or non-numeric stop position in line:\n"+line,e);
}
String eventType="UNKNOWN";
String ref = tokens[REF_ALLELE_COL];
String alt1 = tokens[TUMOR_ALLELE1_COL];
String alt2 = tokens[TUMOR_ALLELE2_COL];
String ref = REF_ALLELE_COL.getValue(tokens);
String alt1 = TUMOR_ALLELE1_COL.getValue(tokens);
String alt2 = TUMOR_ALLELE2_COL.getValue(tokens);
if ( ref.equals("-") ) {
// insertion
@ -208,16 +255,29 @@ public class MafCodec implements FeatureCodec {
}
}
// if we got vartype column, make sure it makes sense:
if ( VARTYPE_COL != -1 && ! tokens[VARTYPE_COL].equals(eventType) )
throw new UserException.MalformedFile("Inconsistency in MAF: variant looks like a "+eventType +" but annotated as "+
tokens[VARTYPE_COL]);
if ( VARTYPE_COL.isSet(tokens) && ! tokens[VARTYPE_COL.getIndex()].equals(eventType) ) {
// special case: we annotate everything as MNP while MAF can have DNP/TNP, these are fine:
if ( eventType == MNP && (
tokens[VARTYPE_COL.getIndex()].equals("DNP") && ref.length() == 2 ||
tokens[VARTYPE_COL.getIndex()].equals("TNP") && ref.length() == 3)
) {} // these are fine
else {
throw new UserException.MalformedFile("Inconsistency in MAF: variant looks like a "+eventType +" but annotated as "+
tokens[VARTYPE_COL.getIndex()]);
}
}
MafFeature feature = new MafFeature(CHR_COL.getValue(tokens),start,stop);
MafFeature feature = new MafFeature(tokens[CHR_COL],start,stop);
if ( ! extra ) return feature; // ignore additional fields unless we were explicitly asked to read those!
feature.setVariantType(eventType);
feature.setRefAllele(ref);
feature.setObservedTumor(alt1,alt2);
feature.setTumorSample(tokens[TUMOR_SAMPLE_COL]);
feature.setNormalSample(tokens[NORMAL_SAMPLE_COL]);
feature.setTumorSample(TUMOR_SAMPLE_COL.getValue(tokens));
feature.setNormalSample(NORMAL_SAMPLE_COL.getValue(tokens));
if ( HUGO_GENE_COL.isSet(tokens) ) feature.setHugoGeneSymbol(tokens[HUGO_GENE_COL.getIndex()]);
if ( VARCLASS_COL.isSet(tokens) ) feature.setVariantClassification(tokens[VARCLASS_COL.getIndex()]);
return feature;
}
@ -239,67 +299,95 @@ public class MafCodec implements FeatureCodec {
*
*/
private void setMafLiteCols() {
    // Fixed positions of the nine required columns. The *_COL fields are Column objects,
    // so positions are stored with setIndex(); assigning an int literal to them does not compile.
    BUILD_COL.setIndex(0);
    CHR_COL.setIndex(1);
    START_COL.setIndex(2);
    END_COL.setIndex(3);
    REF_ALLELE_COL.setIndex(4);
    TUMOR_ALLELE1_COL.setIndex(5);
    TUMOR_ALLELE2_COL.setIndex(6);
    TUMOR_SAMPLE_COL.setIndex(7);
    NORMAL_SAMPLE_COL.setIndex(8);
}
/**
 * Assigns fixed column positions for the annotated maf layout, including the optional
 * variant type, strand, variant classification and gene symbol columns.
 */
private void setMafAnnotatedCols() {
    // The *_COL fields are Column objects, so positions are stored with setIndex();
    // assigning an int literal to them does not compile.
    BUILD_COL.setIndex(3);
    CHR_COL.setIndex(4);
    START_COL.setIndex(5);
    END_COL.setIndex(6);
    REF_ALLELE_COL.setIndex(10);
    TUMOR_ALLELE1_COL.setIndex(11);
    TUMOR_ALLELE2_COL.setIndex(12);
    TUMOR_SAMPLE_COL.setIndex(15);
    NORMAL_SAMPLE_COL.setIndex(16);
    // optional columns
    VARTYPE_COL.setIndex(9);
    STRAND_COL.setIndex(7);
    VARCLASS_COL.setIndex(8);
    HUGO_GENE_COL.setIndex(0);
}
/**
 * Assigns column positions from a header line that has already been split into names.
 * Each column in <code>allColumns</code> looks itself up by name; a required column that
 * is missing makes {@link Column#setFromMap(Map)} throw UserException.MalformedFile,
 * while a missing optional column is left unset.
 * @param tokens the column names of the header line, in file order
 */
private void setColumnsFromHeader(String[] tokens) {
    // name -> position; if a name occurs more than once the last position wins
    Map<String,Integer> colNames = new HashMap<String,Integer>();
    for ( int i = 0 ; i < tokens.length ; i++ ) colNames.put(tokens[i],i);

    // One generic pass covers required and optional columns alike; a per-column lookup
    // would only repeat it, and Column fields cannot take the map's Integer values directly.
    for ( Column c : allColumns ) c.setFromMap(colNames);
}
}
/**
 * Describes one column of a MAF file: its header name, whether it is required, and its
 * zero-based position in a split line. The position is -1 until it has been set.
 */
class Column {
    int index ;
    String name;
    boolean required;

    /**
     * @param name header name of the column
     * @param required whether the column must be present
     */
    Column(String name, boolean required) {
        this.name = name;
        this.required = required;
        this.index = -1; // unset
    }

    public String getName() { return name; }
    public void setName(String name) { this.name = name; }
    public int getIndex() { return index; }
    public void setIndex(int index) { this.index = index; }

    /**
     * Returns this column's value from a split line.
     * @param fields the fields of one line
     * @return the value, or null if this optional column is unset or lies beyond the end of the line
     * @throws UserException.MalformedFile if this column is required and is unset or lies beyond the end of the line
     */
    public String getValue(String[] fields) {
        // the lower bound matters: an unset column has index -1, and fields[-1] would
        // fail with ArrayIndexOutOfBoundsException instead of a meaningful error
        if ( index > -1 && index < fields.length ) return fields[index];
        if ( required ) {
            if ( index < 0 ) throw new UserException.MalformedFile("In MAF file: required column "+name+" has no index assigned");
            throw new UserException.MalformedFile("In MAF file: required column "+name+" has index "+index+
                    ", but only "+fields.length+ " fields are present in maf line");
        }
        return null;
    }

    /** Sets this column's index from the provided name->index map (i.e. searches for itself in the map).
     * If the column is not found, <code>throw_exception</code> is true <i>AND</i> this column is required, then an
     * exception is thrown right away; otherwise the index is reset to -1 and the method returns quietly.
     * @param m map from column name to column index
     * @param throw_exception whether a missing required column should raise an exception
     */
    public void setFromMap(Map<String,Integer> m, boolean throw_exception) {
        Integer i = m.get(this.name);
        if ( i == null ) {
            if ( this.required && throw_exception ) throw new UserException.MalformedFile("Required column "+this.name+" is missing from the maf file");
            index = -1;
            return; // not found
        }
        this.index = i.intValue(); // found and set.
    }

    /** Sets this column's index from the provided name->index map (i.e. searches for itself in the map).
     * If this column is required but not found in the map, then an exception will
     * be thrown.
     * @param m map from column name to column index
     */
    public void setFromMap(Map<String,Integer> m) {
        setFromMap(m,true);
    }

    /** @return true if an index has been assigned to this column */
    public boolean isSet() { return index > -1; }

    /** @return true if an index has been assigned and it lies within <code>fields</code> */
    public boolean isSet(String[] fields) { return index > -1 && index < fields.length; }
}

View File

@ -27,6 +27,7 @@ package org.broadinstitute.sting.playground.gatk.features.maf;
import org.broad.tribble.Feature;
import org.broadinstitute.sting.utils.exceptions.StingException;
import org.broadinstitute.sting.utils.exceptions.UserException;
import java.util.*;
@ -47,11 +48,18 @@ public class MafFeature implements Feature {
private String[] observedNormAlleles = null; // The sequences of the observed alleles in normal
private String tumorSampleId = null;
private String normalSampleId = null;
// Gene symbol; stored by setHugoGeneSymbol(), null until then.
private String hugoSymbol = null;
// Variant classification; stored by setVariantClassification(), null until then.
private Classification classification = null;
/** Kind of variant event; the declaration order (and therefore each ordinal) is fixed. */
public enum Type {
    UNKNOWN,
    SNP,
    MNP,
    INS,
    DEL
}
/** Variant classifications this feature can carry; the declaration order is fixed. */
public enum Classification {
    Unclassified,
    Intergenic,
    Intron,
    Noncoding_transcript,
    UTR3,
    UTR5,
    Flank5,
    Silent,
    Missense,
    Nonsense,
    Splice,
    miRNA,
    Frameshift,
    Inframe,
    Stop_deletion,
    Promoter,
    De_novo_start,
    Splice_site_deletion,
    Splice_site_insertion
}
private Type type = Type.UNKNOWN;
/**
@ -99,6 +107,14 @@ public class MafFeature implements Feature {
return refAllele;
}
/**
 * @return the gene symbol stored by {@link #setHugoGeneSymbol(String)}, or null if none was set
 */
public String getHugoGeneSymbol() {
    return this.hugoSymbol;
}
/**
 * Stores the gene symbol of this feature.
 * @param genename the gene symbol to store
 * @return the value that was just stored
 */
public String setHugoGeneSymbol(String genename) {
    this.hugoSymbol = genename;
    return this.hugoSymbol;
}
/**
* Returns list of alleles (represented as strings) observed in Tumor. Returned alleles
* could be redundant (e.g. if we have homozygous non-ref at ploidy 2+).
@ -187,6 +203,43 @@ public class MafFeature implements Feature {
}
}
/**
 * Tells whether this variant is somatic: the tumor carries a non-reference allele
 * while the normal is all reference.
 * @return false if both tumor alleles equal the reference allele; otherwise true if the
 * normal alleles are absent or both equal the reference allele, and false if not
 */
public boolean isSomatic() {
    final boolean tumorIsRef =
            observedTumAlleles[0].equals(refAllele) && observedTumAlleles[1].equals(refAllele);
    if ( tumorIsRef ) return false;

    // from here on the tumor is known to be non-ref;
    // norm alleles are omitted from maf only if they are all ref
    if ( observedNormAlleles == null ) return true;

    return observedNormAlleles[0].equals(refAllele) && observedNormAlleles[1].equals(refAllele);
}
/** Classification strings accepted by {@link #setVariantClassification(String)} and the values they map to. */
private static final Map<String,Classification> CLASSIFICATION_BY_MAF_VALUE = buildClassificationTable();

/** Builds the read-only lookup table from MAF classification strings to {@link Classification} values. */
private static Map<String,Classification> buildClassificationTable() {
    Map<String,Classification> table = new HashMap<String,Classification>();
    table.put("IGR", Classification.Intergenic);
    table.put("Intron", Classification.Intron);
    table.put("3'UTR", Classification.UTR3);
    table.put("5'UTR", Classification.UTR5);
    table.put("5'-Flank", Classification.Flank5);
    table.put("Silent", Classification.Silent);
    table.put("Non-coding_Transcript", Classification.Noncoding_transcript);
    table.put("Missense", Classification.Missense);
    table.put("Missense_Mutation", Classification.Missense);
    table.put("Nonsense_Mutation", Classification.Nonsense);
    table.put("Splice_Site", Classification.Splice);
    table.put("miRNA", Classification.miRNA);
    table.put("Frame_Shift_Ins", Classification.Frameshift);
    table.put("Frame_Shift_Del", Classification.Frameshift);
    table.put("In_Frame_Ins", Classification.Inframe);
    table.put("In_Frame_Del", Classification.Inframe);
    table.put("Stop_Codon_Del", Classification.Stop_deletion);
    table.put("Splice_Site_Del", Classification.Splice_site_deletion);
    table.put("Splice_Site_Ins", Classification.Splice_site_insertion);
    table.put("Promoter", Classification.Promoter);
    table.put("De_novo_Start", Classification.De_novo_start);
    table.put("TX-REF-MISMATCH", Classification.Unclassified);
    return Collections.unmodifiableMap(table);
}

/**
 * Sets the variant classification from its textual (case-sensitive) representation in the maf file.
 * @param s the classification string as read from the file
 * @throws UserException.MalformedFile if <code>s</code> is null or is not a recognized classification
 */
public void setVariantClassification(String s) {
    // HashMap tolerates a null key, so a null input is reported as a malformed file
    // rather than surfacing as a NullPointerException
    Classification parsed = CLASSIFICATION_BY_MAF_VALUE.get(s);
    if ( parsed == null ) throw new UserException.MalformedFile("Unknown variant classification: " + s);
    classification = parsed;
}
/**
 * @return the classification stored by {@link #setVariantClassification(String)}, or null if none was set
 */
public Classification getVariantClassification() {
    return this.classification;
}
/*
* the required getting and setter methods
*/