diff --git a/public/R/src/gsalib/R/gsa.read.gatkreport.R b/public/R/src/gsalib/R/gsa.read.gatkreport.R index 9b3ef1ad1..011b5240d 100644 --- a/public/R/src/gsalib/R/gsa.read.gatkreport.R +++ b/public/R/src/gsalib/R/gsa.read.gatkreport.R @@ -20,6 +20,20 @@ assign(tableName, d, envir=tableEnv); } +# Read a fixed width line of text into a list. +.gsa.splitFixedWidth <- function(line, columnStarts) { + splitStartStop <- function(x) { + x = substring(x, starts, stops); + x = gsub("^[[:space:]]+|[[:space:]]+$", "", x); + x; + } + + starts = c(1, columnStarts); + stops = c(columnStarts - 1, nchar(line)); + + sapply(line, splitStartStop)[,1]; +} + # Load all GATKReport tables from a file gsa.read.gatkreport <- function(filename) { con = file(filename, "r", blocking = TRUE); @@ -31,9 +45,10 @@ gsa.read.gatkreport <- function(filename) { tableName = NA; tableHeader = c(); tableRows = c(); + version = NA; for (line in lines) { - if (length(grep("^##:GATKReport.v0.1[[:space:]]+", line, ignore.case=TRUE)) > 0) { + if (length(grep("^##:GATKReport.v", line, ignore.case=TRUE)) > 0) { headerFields = unlist(strsplit(line, "[[:space:]]+")); if (!is.na(tableName)) { @@ -43,13 +58,37 @@ gsa.read.gatkreport <- function(filename) { tableName = headerFields[2]; tableHeader = c(); tableRows = c(); + + # For differences in versions see + # $STING_HOME/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportVersion.java + if (length(grep("^##:GATKReport.v0.1[[:space:]]+", line, ignore.case=TRUE)) > 0) { + version = "v0.1"; + + } else if (length(grep("^##:GATKReport.v0.2[[:space:]]+", line, ignore.case=TRUE)) > 0) { + version = "v0.2"; + columnStarts = c(); + + } + } else if (length(grep("^[[:space:]]*$", line)) > 0 | length(grep("^[[:space:]]*#", line)) > 0) { # do nothing } else if (!is.na(tableName)) { - row = unlist(strsplit(line, "[[:space:]]+")); + + if (version == "v0.1") { + row = unlist(strsplit(line, "[[:space:]]+")); + + } else if (version == "v0.2") { + if (length(tableHeader) == 0) { + headerChars = unlist(strsplit(line, "")); + # Find the first position of non space characters, excluding the first character + columnStarts = intersect(grep("[[:space:]]", headerChars, invert=TRUE), grep("[[:space:]]", headerChars) + 1); + } + + row = .gsa.splitFixedWidth(line, columnStarts); + } if (length(tableHeader) == 0) { - tableHeader = row; + tableHeader = row; } else { tableRows = rbind(tableRows, row); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/io/storage/VCFWriterStorage.java b/public/java/src/org/broadinstitute/sting/gatk/io/storage/VCFWriterStorage.java index 1da03e9c2..ebb4cbe66 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/io/storage/VCFWriterStorage.java +++ b/public/java/src/org/broadinstitute/sting/gatk/io/storage/VCFWriterStorage.java @@ -87,8 +87,8 @@ public class VCFWriterStorage implements Storage, VCFWriter { writer.writeHeader(stub.getVCFHeader()); } - public void add(VariantContext vc, byte ref) { - writer.add(vc, ref); + public void add(VariantContext vc) { + writer.add(vc); } /** @@ -117,7 +117,7 @@ public class VCFWriterStorage implements Storage, VCFWriter { BasicFeatureSource source = BasicFeatureSource.getFeatureSource(file.getAbsolutePath(), new VCFCodec(), false); for ( VariantContext vc : source.iterator() ) { - target.writer.add(vc, vc.getReferenceBaseForIndel()); + target.writer.add(vc); } source.close(); diff --git a/public/java/src/org/broadinstitute/sting/gatk/io/stubs/VCFWriterStub.java b/public/java/src/org/broadinstitute/sting/gatk/io/stubs/VCFWriterStub.java index bb84f9457..7a110fde5 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/io/stubs/VCFWriterStub.java +++ b/public/java/src/org/broadinstitute/sting/gatk/io/stubs/VCFWriterStub.java @@ -192,8 +192,8 @@ public class VCFWriterStub implements Stub, VCFWriter { /** * @{inheritDoc} */ - public void add(VariantContext vc, byte ref) { - outputTracker.getStorage(this).add(vc,ref); + public void add(VariantContext vc) { + outputTracker.getStorage(this).add(vc); } /** diff --git a/public/java/src/org/broadinstitute/sting/gatk/refdata/VariantContextAdaptors.java b/public/java/src/org/broadinstitute/sting/gatk/refdata/VariantContextAdaptors.java index 4db5d8ab6..d6f8bab9b 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/refdata/VariantContextAdaptors.java +++ b/public/java/src/org/broadinstitute/sting/gatk/refdata/VariantContextAdaptors.java @@ -127,14 +127,13 @@ public class VariantContextAdaptors { Map attributes = new HashMap(); attributes.put(VariantContext.ID_KEY, dbsnp.getRsID()); - if ( sawNullAllele ) { - int index = dbsnp.getStart() - ref.getWindow().getStart() - 1; - if ( index < 0 ) - return null; // we weren't given enough reference context to create the VariantContext - attributes.put(VariantContext.REFERENCE_BASE_FOR_INDEL_KEY, new Byte(ref.getBases()[index])); - } - Collection genotypes = null; - VariantContext vc = new VariantContext(name, dbsnp.getChr(), dbsnp.getStart() - (sawNullAllele ? 1 : 0),dbsnp.getEnd(), alleles, genotypes, VariantContext.NO_NEG_LOG_10PERROR, null, attributes); + int index = dbsnp.getStart() - ref.getWindow().getStart() - 1; + if ( index < 0 ) + return null; // we weren't given enough reference context to create the VariantContext + Byte refBaseForIndel = new Byte(ref.getBases()[index]); + + Map genotypes = null; + VariantContext vc = new VariantContext(name, dbsnp.getChr(), dbsnp.getStart() - (sawNullAllele ? 1 : 0), dbsnp.getEnd(), alleles, genotypes, VariantContext.NO_NEG_LOG_10PERROR, null, attributes, refBaseForIndel); return vc; } else return null; // can't handle anything else diff --git a/public/java/src/org/broadinstitute/sting/gatk/report/GATKReport.java b/public/java/src/org/broadinstitute/sting/gatk/report/GATKReport.java index 59d496828..dc3a617e7 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/report/GATKReport.java +++ b/public/java/src/org/broadinstitute/sting/gatk/report/GATKReport.java @@ -1,21 +1,23 @@ package org.broadinstitute.sting.gatk.report; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.StingException; +import org.broadinstitute.sting.utils.text.TextFormattingUtils; import java.io.*; +import java.util.List; import java.util.TreeMap; /** * Container class for GATK report tables */ public class GATKReport { - private TreeMap tables; + private TreeMap tables = new TreeMap(); /** * Create a new, empty GATKReport. */ public GATKReport() { - tables = new TreeMap(); } /** @@ -23,7 +25,7 @@ public class GATKReport { * @param filename the path to the file to load */ public GATKReport(String filename) { - loadReport(new File(filename)); + this(new File(filename)); } /** @@ -31,7 +33,6 @@ public class GATKReport { * @param file the file to load */ public GATKReport(File file) { - tables = new TreeMap(); loadReport(file); } @@ -46,11 +47,17 @@ public class GATKReport { GATKReportTable table = null; String[] header = null; int id = 0; + GATKReportVersion version = null; + List columnStarts = null; String line; while ( (line = reader.readLine()) != null ) { - if (line.startsWith("##:GATKReport.v0.1 ")) { - line = line.replaceFirst("##:GATKReport.v0.1 ", ""); + + if (line.startsWith("##:GATKReport.v")) { + + version = GATKReportVersion.fromHeader(line); + + line = line.replaceFirst("##:GATKReport." + version.versionString + " ", ""); String[] pieces = line.split(" : "); String tableName = pieces[0]; @@ -58,14 +65,35 @@ public class GATKReport { addTable(tableName, tableDesc); table = getTable(tableName); + table.setVersion(version); header = null; - } else if ( line.isEmpty() ) { + columnStarts = null; + } else if ( line.trim().isEmpty() ) { // do nothing } else { if (table != null) { + + String[] splitLine; + + switch (version) { + case V0_1: + splitLine = TextFormattingUtils.splitWhiteSpace(line); + break; + + case V0_2: + if (header == null) { + columnStarts = TextFormattingUtils.getWordStarts(line); + } + splitLine = TextFormattingUtils.splitFixedWidth(line, columnStarts); + break; + + default: + throw new ReviewedStingException("GATK report version parsing not implemented for: " + line); + } + if (header == null) { - header = line.split("\\s+"); + header = splitLine; table.addPrimaryKey("id", false); @@ -75,10 +103,8 @@ public class GATKReport { id = 0; } else { - String[] entries = line.split("\\s+"); - for (int columnIndex = 0; columnIndex < header.length; columnIndex++) { - table.set(id, header[columnIndex], entries[columnIndex]); + table.set(id, header[columnIndex], splitLine[columnIndex]); } id++; @@ -125,7 +151,10 @@ public class GATKReport { * @return the table object */ public GATKReportTable getTable(String tableName) { - return tables.get(tableName); + GATKReportTable table = tables.get(tableName); + if (table == null) + throw new ReviewedStingException("Table is not in GATKReport: " + tableName); + return table; } /** diff --git a/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportColumn.java b/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportColumn.java index 440597754..1c46b3bac 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportColumn.java +++ b/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportColumn.java @@ -37,10 +37,10 @@ public class GATKReportColumn extends TreeMap { * tables, as the table gets written properly without having to waste storage for the unset elements (usually the zero * values) in the table. * - * @param primaryKey the primary key position in the column that should be set + * @param primaryKey the primary key position in the column that should be retrieved * @return the value at the specified position in the column, or the default value if the element is not set */ - public Object getWithoutSideEffects(Object primaryKey) { + private Object getWithoutSideEffects(Object primaryKey) { if (!this.containsKey(primaryKey)) { return defaultValue; } @@ -48,6 +48,16 @@ public class GATKReportColumn extends TreeMap { return this.get(primaryKey); } + /** + * Return an object from the column, but if it doesn't exist, return the default value. + * + * @param primaryKey the primary key position in the column that should be retrieved + * @return the string value at the specified position in the column, or the default value if the element is not set + */ + public String getStringValue(Object primaryKey) { + return toString(getWithoutSideEffects(primaryKey)); + } + /** * Return the displayable property of the column. If true, the column will be displayed in the final output. * If not, printing will be suppressed for the contents of the table. @@ -67,7 +77,7 @@ public class GATKReportColumn extends TreeMap { for (Object obj : this.values()) { if (obj != null) { - int width = obj.toString().length(); + int width = toString(obj).length(); if (width > maxWidth) { maxWidth = width; @@ -77,4 +87,23 @@ public class GATKReportColumn extends TreeMap { return maxWidth; } + + /** + * Returns a string version of the values. + * @param obj The object to convert to a string + * @return The string representation of the column + */ + private static String toString(Object obj) { + String value; + if (obj == null) { + value = "null"; + } else if (obj instanceof Float) { + value = String.format("%.8f", (Float) obj); + } else if (obj instanceof Double) { + value = String.format("%.8f", (Double) obj); + } else { + value = obj.toString(); + } + return value; + } } diff --git a/public/java/test/org/broadinstitute/sting/gatk/report/GATKReportParserUnitTest.java b/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportColumns.java similarity index 50% rename from public/java/test/org/broadinstitute/sting/gatk/report/GATKReportParserUnitTest.java rename to public/java/src/org/broadinstitute/sting/gatk/report/GATKReportColumns.java index cfd75c41a..a33631c85 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/report/GATKReportParserUnitTest.java +++ b/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportColumns.java @@ -24,26 +24,32 @@ package org.broadinstitute.sting.gatk.report; -import org.broadinstitute.sting.BaseTest; -import org.testng.Assert; -import org.testng.annotations.Test; +import java.util.*; -import java.io.File; +/** + * Tracks a linked list of GATKReportColumn in order by name. + */ +public class GATKReportColumns extends LinkedHashMap { + private List columnNames = new ArrayList(); -public class GATKReportParserUnitTest extends BaseTest { - @Test - public void testParse() throws Exception { - GATKReportParser parser = new GATKReportParser(); - parser.parse(new File(validationDataLocation + "exampleGATKReport.eval")); + /** + * Returns the column by index + * @param i the index + * @return The column + */ + public GATKReportColumn getByIndex(int i) { + return get(columnNames.get(i)); + } - Assert.assertEquals(parser.getValue("CountVariants", "none.eval.none.all", "nProcessedLoci"), "100000"); - Assert.assertEquals(parser.getValue("CountVariants", "none.eval.none.all", "nNoCalls"), "99872"); + @Override + public GATKReportColumn remove(Object key) { + columnNames.remove(key); + return super.remove(key); + } - Assert.assertEquals(parser.getValue("SimpleMetricsByAC.metrics", "none.eval.none.novel.ac2", "AC"), "2"); - Assert.assertNull(parser.getValue("SimpleMetricsByAC.metrics", "none.eval.none.novel.ac2.bad", "AC")); - Assert.assertNull(parser.getValue("SimpleMetricsByAC.metrics", "none.eval.none.novel.ac2", "AC.bad")); - Assert.assertNull(parser.getValue("SimpleMetricsByAC.metrics.bad", "none.eval.none.novel.ac2", "AC")); - - Assert.assertEquals(parser.getValue("ValidationReport", "none.eval.none.known", "sensitivity"), "NaN"); + @Override + public GATKReportColumn put(String key, GATKReportColumn value) { + columnNames.add(key); + return super.put(key, value); } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportParser.java b/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportParser.java deleted file mode 100644 index 6915d5cb2..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportParser.java +++ /dev/null @@ -1,83 +0,0 @@ -/* - * Copyright (c) 2011, The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.gatk.report; - -import org.apache.commons.io.FileUtils; -import org.apache.commons.io.IOUtils; -import org.broadinstitute.sting.utils.text.XReadLines; - -import java.io.File; -import java.io.IOException; -import java.io.InputStream; -import java.util.ArrayList; -import java.util.List; - -public class GATKReportParser { - private List tables = new ArrayList(); - - public void parse(File file) throws IOException { - InputStream stream = FileUtils.openInputStream(file); - try { - parse(stream); - } finally { - IOUtils.closeQuietly(stream); - } - } - - public void parse(InputStream input) throws IOException { - GATKReportTableParser table = null; - - for (String line: new XReadLines(input)) { - if (line.startsWith("##:GATKReport.v0.1 ")) { - table = newTableParser(line); - tables.add(table); - table.parse(line); - } else if (table != null) { - if (line.trim().length() == 0) - table = null; - else - table.parse(line); - } - } - } - - public String getValue(String tableName, String[] key, String column) { - for (GATKReportTableParser table: tables) - if (table.getTableName().equals(tableName)) - return table.getValue(key, column); - return null; - } - - public String getValue(String tableName, String key, String column) { - for (GATKReportTableParser table: tables) - if (table.getTableName().equals(tableName)) - return table.getValue(key, column); - return null; - } - - private GATKReportTableParser newTableParser(String header) { - return new GATKReportTableParser(); - } -} diff --git a/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportTable.java b/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportTable.java index f7ea25696..5d38295f5 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportTable.java +++ b/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportTable.java @@ -1,5 +1,6 @@ package org.broadinstitute.sting.gatk.report; +import org.apache.commons.lang.ObjectUtils; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import java.io.PrintStream; @@ -88,17 +89,20 @@ import java.util.regex.Pattern; * but at least the prototype contained herein works. * * @author Kiran Garimella + * @author Khalid Shakir */ public class GATKReportTable { + private static final GATKReportVersion LATEST_REPORT_VERSION = GATKReportVersion.V0_2; private String tableName; private String tableDescription; + private GATKReportVersion version = LATEST_REPORT_VERSION; private String primaryKeyName; private Collection primaryKeyColumn; private boolean primaryKeyDisplay; - boolean sortByPrimaryKey = true; + private boolean sortByPrimaryKey = true; - private LinkedHashMap columns; + private GATKReportColumns columns; /** * Verifies that a table or column name has only alphanumeric characters - no spaces or special characters allowed @@ -113,6 +117,19 @@ public class GATKReportTable { return !m.find(); } + /** + * Verifies that a table or column name has only alphanumeric characters - no spaces or special characters allowed + * + * @param description the name of the table or column + * @return true if the name is valid, false if otherwise + */ + private boolean isValidDescription(String description) { + Pattern p = Pattern.compile("\\r|\\n"); + Matcher m = p.matcher(description); + + return !m.find(); + } + /** * Construct a new GATK report table with the specified name and description * @@ -128,11 +145,23 @@ public class GATKReportTable { throw new ReviewedStingException("Attempted to set a GATKReportTable name of '" + tableName + "'. GATKReportTable names must be purely alphanumeric - no spaces or special characters are allowed."); } + if (!isValidDescription(tableDescription)) { + throw new ReviewedStingException("Attempted to set a GATKReportTable description of '" + tableDescription + "'. GATKReportTable descriptions must not contain newlines."); + } + this.tableName = tableName; this.tableDescription = tableDescription; this.sortByPrimaryKey = sortByPrimaryKey; - columns = new LinkedHashMap(); + columns = new GATKReportColumns(); + } + + public GATKReportVersion getVersion() { + return version; + } + + protected void setVersion(GATKReportVersion version) { + this.version = version; } /** @@ -161,6 +190,57 @@ public class GATKReportTable { primaryKeyDisplay = display; } + /** + * Returns the first primary key matching the dotted column values. + * Ex: dbsnp.eval.called.all.novel.all + * @param dottedColumnValues Period concatenated values. + * @return The first primary key matching the column values or throws an exception. + */ + public Object getPrimaryKey(String dottedColumnValues) { + Object key = findPrimaryKey(dottedColumnValues); + if (key == null) + throw new ReviewedStingException("Attempted to get non-existent GATKReportTable key for values: " + dottedColumnValues); + return key; + } + + /** + * Returns true if there is at least on row with the dotted column values. + * Ex: dbsnp.eval.called.all.novel.all + * @param dottedColumnValues Period concatenated values. + * @return true if there is at least one row matching the columns. + */ + public boolean containsPrimaryKey(String dottedColumnValues) { + return findPrimaryKey(dottedColumnValues) != null; + } + + /** + * Returns the first primary key matching the dotted column values. + * Ex: dbsnp.eval.called.all.novel.all + * @param dottedColumnValues Period concatenated values. + * @return The first primary key matching the column values or null. + */ + private Object findPrimaryKey(String dottedColumnValues) { + return findPrimaryKey(dottedColumnValues.split("\\.")); + } + + /** + * Returns the first primary key matching the column values. + * Ex: new String[] { "dbsnp", "eval", "called", "all", "novel", "all" } + * @param columnValues column values. + * @return The first primary key matching the column values. + */ + private Object findPrimaryKey(Object[] columnValues) { + for (Object primaryKey : primaryKeyColumn) { + boolean matching = true; + for (int i = 0; matching && i < columnValues.length; i++) { + matching = ObjectUtils.equals(columnValues[i], get(primaryKey, i+1)); + } + if (matching) + return primaryKey; + } + return null; + } + /** * Add a column to the report and specify the default value that should be supplied if a given position in the table is never explicitly set. * @@ -230,6 +310,17 @@ public class GATKReportTable { return columns.get(columnName).get(primaryKey); } + /** + * Get a value from the given position in the table + * + * @param primaryKey the primary key value + * @param columnIndex the index of the column + * @return the value stored at the specified position in the table + */ + private Object get(Object primaryKey, int columnIndex) { + return columns.getByIndex(columnIndex).get(primaryKey); + } + /** * Increment an element in the table. This implementation is awful - a functor would probably be better. * @@ -515,7 +606,7 @@ public class GATKReportTable { String primaryKeyFormat = "%-" + getPrimaryKeyColumnWidth() + "s"; // Emit the table definition - out.printf("##:GATKReport.v0.1 %s : %s%n", tableName, tableDescription); + out.printf("##:GATKReport.%s %s : %s%n", LATEST_REPORT_VERSION.versionString, tableName, tableDescription); // Emit the table header, taking into account the padding requirement if the primary key is a hidden column boolean needsPadding = false; @@ -545,22 +636,8 @@ public class GATKReportTable { for (String columnName : columns.keySet()) { if (columns.get(columnName).isDisplayable()) { - Object obj = columns.get(columnName).getWithoutSideEffects(primaryKey); - if (needsPadding) { out.printf(" "); } - - String value = "null"; - if (obj != null) { - if (obj instanceof Float) { - value = String.format("%.8f", (Float) obj); - } else if (obj instanceof Double) { - value = String.format("%.8f", (Double) obj); - } else { - value = obj.toString(); - } - } - - //out.printf(columnWidths.get(columnName), obj == null ? "null" : obj.toString()); + String value = columns.get(columnName).getStringValue(primaryKey); out.printf(columnWidths.get(columnName), value); needsPadding = true; diff --git a/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportTableParser.java b/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportTableParser.java deleted file mode 100644 index 6fd9f9627..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportTableParser.java +++ /dev/null @@ -1,75 +0,0 @@ -/* - * Copyright (c) 2011, The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.gatk.report; - -import org.apache.commons.lang.StringUtils; - -import java.util.*; - -public class GATKReportTableParser { - private int lineNum = 0; - private String[] descriptions; - private Map headers = new HashMap(); - private List values = new ArrayList(); - - public void parse(String line) { - lineNum++; - switch (lineNum) { - case 1: - descriptions = parseLine(line); - case 2: - String[] columnHeaders = parseLine(line); - for (int i = 0; i < columnHeaders.length; i++) - headers.put(columnHeaders[i], i); - default: - values.add(parseLine(line)); - } - } - - public String getTableName() { - return descriptions[1]; - } - - public String getValue(String[] key, String column) { - if (!headers.containsKey(column)) - return null; - for (String[] row: values) - if (Arrays.equals(key, Arrays.copyOfRange(row, 1, key.length + 1))) - return row[headers.get(column)]; - return null; - } - - public String getValue(String key, String column) { - return getValue(key.split("\\."), column); - } - - private String generateKey(String[] row, int i) { - return StringUtils.join(row, ".", 0, i); - } - - private String[] parseLine(String line) { - return line.split(" +"); - } -} diff --git a/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportVersion.java b/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportVersion.java new file mode 100644 index 000000000..5f1159a43 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportVersion.java @@ -0,0 +1,70 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.report; + +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; + +public enum GATKReportVersion { + /** + * Differences between other versions: + * - Does not allow spaces in cells. + * - Mostly fixed width but has a bug where the string width of floating point + * values was not measured correctly leading to columns that aren't aligned + */ + V0_1("v0.1"), + + /** + * Differences between other versions: + * - Spaces allowed in cells, for example in sample names with spaces in them ex: "C507/FG-CR 6". + * - Fixed width fixed for floating point values + */ + V0_2("v0.2"); + + public final String versionString; + + private GATKReportVersion(String versionString) { + this.versionString = versionString; + } + + @Override + public String toString() { + return versionString; + } + + /** + * Returns the GATK Report Version from the file header. + * @param header Header from the file starting with ##:GATKReport.v[version] + * @return The version as an enum. + */ + public static GATKReportVersion fromHeader(String header) { + if (header.startsWith("##:GATKReport.v0.1 ")) + return GATKReportVersion.V0_1; + + if (header.startsWith("##:GATKReport.v0.2 ")) + return GATKReportVersion.V0_2; + + throw new ReviewedStingException("Unknown GATK report version in header: " + header); + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java index 59d79ebf2..85035ba93 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java @@ -224,12 +224,12 @@ public class VariantAnnotator extends RodWalker { if ( ! indelsOnly ) { for ( VariantContext annotatedVC : annotatedVCs ) - vcfWriter.add(annotatedVC, ref.getBase()); + vcfWriter.add(annotatedVC); } else { // check to see if the buffered context is different (in location) this context if ( indelBufferContext != null && ! VariantContextUtils.getLocation(getToolkit().getGenomeLocParser(),indelBufferContext.iterator().next()).equals(VariantContextUtils.getLocation(getToolkit().getGenomeLocParser(),annotatedVCs.iterator().next())) ) { for ( VariantContext annotatedVC : indelBufferContext ) - vcfWriter.add(annotatedVC, ref.getBase()); + vcfWriter.add(annotatedVC); indelBufferContext = annotatedVCs; } else { indelBufferContext = annotatedVCs; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/BeagleOutputToVCFWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/BeagleOutputToVCFWalker.java index a45d89b19..9a432d4bf 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/BeagleOutputToVCFWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/BeagleOutputToVCFWalker.java @@ -135,7 +135,7 @@ public class BeagleOutputToVCFWalker extends RodWalker { return 0; if (vc_input.isFiltered()) { - vcfWriter.add(vc_input, ref.getBase()); + vcfWriter.add(vc_input); return 1; } @@ -335,7 +335,7 @@ public class BeagleOutputToVCFWalker extends RodWalker { } - vcfWriter.add(VariantContext.modifyAttributes(filteredVC,attributes), ref.getBase()); + vcfWriter.add(VariantContext.modifyAttributes(filteredVC,attributes)); return 1; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/ProduceBeagleInputWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/ProduceBeagleInputWalker.java index e4b2dbfee..0ccba13a2 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/ProduceBeagleInputWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/ProduceBeagleInputWalker.java @@ -170,20 +170,20 @@ public class ProduceBeagleInputWalker extends RodWalker { logger.debug(String.format("boot: %d, test: %d, total: %d", bootstrapSetSize, testSetSize, bootstrapSetSize+testSetSize+1)); if ( (bootstrapSetSize+1.0)/(1.0+bootstrapSetSize+testSetSize) <= bootstrap ) { if ( bootstrapVCFOutput != null ) { - bootstrapVCFOutput.add(VariantContext.modifyFilters(validation, BOOTSTRAP_FILTER), ref.getBase() ); + bootstrapVCFOutput.add(VariantContext.modifyFilters(validation, BOOTSTRAP_FILTER)); } bootstrapSetSize++; return true; } else { if ( bootstrapVCFOutput != null ) { - bootstrapVCFOutput.add(validation,ref.getBase()); + bootstrapVCFOutput.add(validation); } testSetSize++; return false; } } else { if ( validation != null && bootstrapVCFOutput != null ) { - bootstrapVCFOutput.add(validation,ref.getBase()); + bootstrapVCFOutput.add(validation); } return false; } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/VariantsToBeagleUnphasedWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/VariantsToBeagleUnphasedWalker.java index 93f62f85d..d26bfeca4 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/VariantsToBeagleUnphasedWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/VariantsToBeagleUnphasedWalker.java @@ -112,7 +112,7 @@ public class VariantsToBeagleUnphasedWalker extends RodWalker // if we are holding it back and we are writing a bootstrap VCF, write it out if ( makeMissing && bootstrapVCFOutput != null ) { - bootstrapVCFOutput.add(vc, ref.getBase()); + bootstrapVCFOutput.add(vc); } // regardless, all sites are written to the unphased genotypes file, marked as missing if appropriate diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/DiffEngine.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/DiffEngine.java index 4a4f6f6af..4e3342609 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/DiffEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/DiffEngine.java @@ -235,7 +235,7 @@ public class DiffEngine { // now that we have a specific list of values we want to show, display them GATKReport report = new GATKReport(); final String tableName = "diffences"; - report.addTable(tableName, "Summarized differences between the master and test files.\nSee http://www.broadinstitute.org/gsa/wiki/index.php/DiffEngine for more information", false); + report.addTable(tableName, "Summarized differences between the master and test files. See http://www.broadinstitute.org/gsa/wiki/index.php/DiffEngine for more information", false); GATKReportTable table = report.getTable(tableName); table.addPrimaryKey("Difference", true); table.addColumn("NumberOfOccurrences", 0); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/filters/VariantFiltrationWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/filters/VariantFiltrationWalker.java index cad3569e5..f3c6cd687 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/filters/VariantFiltrationWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/filters/VariantFiltrationWalker.java @@ -278,7 +278,7 @@ public class VariantFiltrationWalker extends RodWalker { else filteredVC = new VariantContext(vc.getSource(), vc.getChr(), vc.getStart(), vc.getEnd(), vc.getAlleles(), genotypes, vc.getNegLog10PError(), filters, vc.getAttributes()); - writer.add( filteredVC, context.getReferenceContext().getBase() ); + writer.add(filteredVC); } public Integer reduce(Integer value, Integer sum) { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UGCalcLikelihoods.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UGCalcLikelihoods.java index 22c3081a3..e5e78905f 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UGCalcLikelihoods.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UGCalcLikelihoods.java @@ -93,7 +93,7 @@ public class UGCalcLikelihoods extends LocusWalker public VariantCallContext map(RefMetaDataTracker tracker, ReferenceContext refContext, AlignmentContext rawContext) { VariantContext call = UG_engine.calculateLikelihoods(tracker, refContext, rawContext); - return call == null ? null : new VariantCallContext(call, refContext.getBase(), true); + return call == null ? null : new VariantCallContext(call, true); } public Integer reduceInit() { return 0; } @@ -107,7 +107,7 @@ public class UGCalcLikelihoods extends LocusWalker return sum; try { - writer.add(value, value.refBase); + writer.add(value); } catch (IllegalArgumentException e) { throw new IllegalArgumentException(e.getMessage() + "; this is often caused by using the --assume_single_sample_reads argument with the wrong sample name"); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UGCallVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UGCallVariants.java index f834240b1..d91f8d2e4 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UGCallVariants.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UGCallVariants.java @@ -115,7 +115,7 @@ public class UGCallVariants extends RodWalker { try { Map attrs = new HashMap(value.getAttributes()); VariantContextUtils.calculateChromosomeCounts(value, attrs, true); - writer.add(VariantContext.modifyAttributes(value, attrs), value.refBase); + writer.add(VariantContext.modifyAttributes(value, attrs)); } catch (IllegalArgumentException e) { throw new IllegalArgumentException(e.getMessage() + "; this is often caused by using the --assume_single_sample_reads argument with the wrong sample name"); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java index e8dca8c49..812511322 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java @@ -227,7 +227,7 @@ public class UnifiedGenotyper extends LocusWalker GLs) { @@ -300,7 +300,8 @@ public class UnifiedGenotyperEngine { genotypes, VariantContext.NO_NEG_LOG_10PERROR, null, - null); + null, + refContext.getBase()); } // private method called by both UnifiedGenotyper and UGCallVariants entry points into the engine @@ -425,10 +426,10 @@ public class UnifiedGenotyperEngine { myAlleles.add(vc.getReference()); } VariantContext vcCall = new VariantContext("UG_call", loc.getContig(), loc.getStart(), endLoc, - myAlleles, genotypes, phredScaledConfidence/10.0, passesCallThreshold(phredScaledConfidence) ? null : filter, attributes); + myAlleles, genotypes, phredScaledConfidence/10.0, passesCallThreshold(phredScaledConfidence) ? null : filter, attributes, refContext.getBase()); if ( annotationEngine != null ) { - // first off, we want to use the *unfiltered* and *unBAQed* context for the annotations + // Note: we want to use the *unfiltered* and *unBAQed* context for the annotations ReadBackedPileup pileup = null; if (rawContext.hasExtendedEventPileup()) pileup = rawContext.getExtendedEventPileup(); @@ -439,9 +440,7 @@ public class UnifiedGenotyperEngine { vcCall = annotationEngine.annotateContext(tracker, refContext, stratifiedContexts, vcCall); } - VariantCallContext call = new VariantCallContext(vcCall, confidentlyCalled(phredScaledConfidence, PofF)); - call.setRefBase(refContext.getBase()); - return call; + return new VariantCallContext(vcCall, confidentlyCalled(phredScaledConfidence, PofF)); } private int calculateEndPos(Set alleles, Allele refAllele, GenomeLoc loc) { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/VariantCallContext.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/VariantCallContext.java index 5896e784e..423c80112 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/VariantCallContext.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/VariantCallContext.java @@ -36,7 +36,6 @@ import org.broadinstitute.sting.utils.variantcontext.VariantContext; * Useful helper class to communicate the results of calculateGenotype to framework */ public class VariantCallContext extends VariantContext { - public byte refBase; // Was the site called confidently, either reference or variant? public boolean confidentlyCalled = false; @@ -55,16 +54,6 @@ public class VariantCallContext extends VariantContext { this.shouldEmit = shouldEmit; } - VariantCallContext(VariantContext vc, byte ref, boolean confidentlyCalledP) { - super(vc); - this.refBase = ref; - this.confidentlyCalled = confidentlyCalledP; - } - - public void setRefBase(byte ref) { - this.refBase = ref; - } - /* these methods are only implemented for GENOTYPE_GIVEN_ALLELES MODE */ //todo -- expand these methods to all modes diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/SomaticIndelDetectorWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/SomaticIndelDetectorWalker.java index 78ad42ce1..64cff2c0d 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/SomaticIndelDetectorWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/SomaticIndelDetectorWalker.java @@ -1,2086 +1,2086 @@ -/* - * Copyright (c) 2010 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR - * THE USE OR OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.gatk.walkers.indels; - -import net.sf.samtools.*; -import org.broadinstitute.sting.commandline.Argument; -import org.broadinstitute.sting.commandline.Hidden; -import org.broadinstitute.sting.commandline.Output; -import org.broadinstitute.sting.commandline.Tags; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.datasources.reads.SAMReaderID; -import org.broadinstitute.sting.gatk.datasources.reference.ReferenceDataSource; -import org.broadinstitute.sting.gatk.filters.MappingQualityZeroReadFilter; -import org.broadinstitute.sting.gatk.filters.Platform454Filter; -import org.broadinstitute.sting.gatk.filters.PlatformUnitFilter; -import org.broadinstitute.sting.gatk.filters.PlatformUnitFilterHelper; -import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker; -import org.broadinstitute.sting.gatk.refdata.SeekableRODIterator; -import org.broadinstitute.sting.gatk.refdata.features.refseq.Transcript; -import org.broadinstitute.sting.gatk.refdata.features.refseq.RefSeqCodec; -import org.broadinstitute.sting.gatk.refdata.features.refseq.RefSeqFeature; -import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrack; -import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrackBuilder; -import org.broadinstitute.sting.gatk.refdata.utils.LocationAwareSeekableRODIterator; -import org.broadinstitute.sting.gatk.refdata.utils.RODRecordList; -import org.broadinstitute.sting.gatk.walkers.ReadFilters; -import org.broadinstitute.sting.gatk.walkers.ReadWalker; -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.GenomeLocSortedSet; -import org.broadinstitute.sting.utils.SampleUtils; -import org.broadinstitute.sting.utils.codecs.vcf.*; -import org.broadinstitute.sting.utils.collections.CircularArray; -import org.broadinstitute.sting.utils.collections.PrimitivePair; -import org.broadinstitute.sting.utils.exceptions.StingException; -import org.broadinstitute.sting.utils.exceptions.UserException; -import org.broadinstitute.sting.utils.interval.IntervalFileMergingIterator; -import org.broadinstitute.sting.utils.interval.IntervalMergingRule; -import org.broadinstitute.sting.utils.interval.IntervalUtils; -import org.broadinstitute.sting.utils.interval.OverlappingIntervalIterator; -import org.broadinstitute.sting.utils.sam.AlignmentUtils; -import org.broadinstitute.sting.utils.variantcontext.Allele; -import org.broadinstitute.sting.utils.variantcontext.Genotype; -import org.broadinstitute.sting.utils.variantcontext.VariantContext; - -import java.io.*; -import java.util.*; - -/** - * This is a simple, counts-and-cutoffs based tool for calling indels from aligned (preferrably MSA cleaned) sequencing - * data. Two output formats supported are: BED format (minimal output, required), and extended output that includes read - * and mismtach statistics around the calls (tuned on with --verbose). The calls can be performed from a single/pooled sample, - * or from a matched pair of samples (with --somatic option). In the latter case, two input bam files must be specified, - * the order is important: indels are called from the second sample ("Tumor") and additionally annotated as germline - * if even a weak evidence for the same indel, not necessarily a confident call, exists in the first sample ("Normal"), or as somatic - * if first bam has coverage at the site but no indication for an indel. In the --somatic mode, BED output contains - * only somatic calls, while --verbose output contains all calls annotated with GERMLINE/SOMATIC keywords. - */ -@ReadFilters({Platform454Filter.class, MappingQualityZeroReadFilter.class, PlatformUnitFilter.class}) -public class SomaticIndelDetectorWalker extends ReadWalker { -// @Output -// PrintStream out; - @Output(doc="File to which variants should be written",required=true) - protected VCFWriter vcf_writer = null; - - @Argument(fullName="outputFile", shortName="O", doc="output file name (BED format). DEPRECATED> Use --bed", required=true) - @Deprecated - java.io.File output_file; - - @Argument(fullName = "metrics_file", shortName = "metrics", doc = "File to print callability metrics output", required = false) - public PrintStream metricsWriter = null; - -// @Argument(fullName="vcf_format", shortName="vcf", doc="generate output file in VCF format", required=false) -// boolean FORMAT_VCF = false; - - @Hidden - @Argument(fullName = "genotype_intervals", shortName = "genotype", - doc = "Calls will be made at each position within the specified interval(s), whether there is an indel or it's the ref", required = false) - public String genotypeIntervalsFile = null; - - @Hidden - @Argument(fullName="genotypeIntervalsAreNotSorted", shortName="giNotSorted", required=false, - doc="This tool assumes that the genotyping interval list (--genotype_intervals) is sorted; "+ - "if the list turns out to be unsorted, it will throw an exception. "+ - "Use this argument when your interval list is not sorted to instruct the IndelGenotyper "+ - "to sort and keep it in memory (increases memory usage!).") - protected boolean GENOTYPE_NOT_SORTED = false; - - @Hidden - @Argument(fullName="unpaired", shortName="unpaired", - doc="Perform unpaired calls (no somatic status detection)", required=false) - boolean call_unpaired = false; - boolean call_somatic ; - - @Argument(fullName="verboseOutput", shortName="verbose", - doc="Verbose output file in text format", required=false) - java.io.File verboseOutput = null; - - @Argument(fullName="bedOutput", shortName="bed", - doc="Lightweight bed output file (only positions and events, no stats/annotations)", required=false) - java.io.File bedOutput = null; - - @Argument(fullName="minCoverage", shortName="minCoverage", - doc="indel calls will be made only at sites with coverage of minCoverage or more reads; with --somatic this value is applied to tumor sample", required=false) - int minCoverage = 6; - - @Argument(fullName="minNormalCoverage", shortName="minNormalCoverage", - doc="used only with --somatic; normal sample must have at least minNormalCoverage or more reads at the site to call germline/somatic indel, otherwise the indel (in tumor) is ignored", required=false) - int minNormalCoverage = 4; - - @Argument(fullName="minFraction", shortName="minFraction", - doc="Minimum fraction of reads with CONSENSUS indel at a site, out of all reads covering the site, required for making a call"+ - " (fraction of non-consensus indels at the site is not considered here, see minConsensusFraction)", required=false) - double minFraction = 0.3; - - @Argument(fullName="minConsensusFraction", shortName="minConsensusFraction", - doc="Indel call is made only if fraction of CONSENSUS indel observations at a site wrt all indel observations at the site exceeds this threshold", required=false) - double minConsensusFraction = 0.7; - - @Argument(fullName="minIndelCount", shortName="minCnt", - doc="Minimum count of reads supporting consensus indel required for making the call. "+ - " This filter supercedes minFraction, i.e. indels with acceptable minFraction at low coverage "+ - "(minIndelCount not met) will not pass.", required=false) - int minIndelCount = 0; - - @Argument(fullName="refseq", shortName="refseq", - doc="Name of RefSeq transcript annotation file. If specified, indels will be annotated with GENOMIC/UTR/INTRON/CODING and with the gene name", required=false) - String RefseqFileName = null; - - @Argument(fullName="blacklistedLanes", shortName="BL", - doc="Name of lanes (platform units) that should be ignored. Reads coming from these lanes will never be seen "+ - "by this application, so they will not contribute indels to consider and will not be counted.", required=false) - PlatformUnitFilterHelper dummy; - @Argument(fullName="indel_debug", shortName="idebug", doc="Detailed printout for debugging, do not turn this on",required=false) Boolean DEBUG = false; - @Argument(fullName="window_size", shortName="ws", doc="Size (bp) of the sliding window used for accumulating the coverage. "+ - "May need to be increased to accomodate longer reads or longer deletions.",required=false) int WINDOW_SIZE = 200; - @Argument(fullName="maxNumberOfReads",shortName="mnr",doc="Maximum number of reads to cache in the window; if number of reads exceeds this number,"+ - " the window will be skipped and no calls will be made from it",required=false) int MAX_READ_NUMBER = 10000; - - private WindowContext tumor_context; - private WindowContext normal_context; - private int currentContigIndex = -1; - private int contigLength = -1; // we see to much messy data with reads hanging out of contig ends... - private int currentPosition = -1; // position of the last read we've seen on the current contig - private String refName = null; - private java.io.Writer output = null; - private GenomeLoc location = null; - private long normalCallsMade = 0L, tumorCallsMade = 0L; - - boolean outOfContigUserWarned = false; - - private LocationAwareSeekableRODIterator refseqIterator=null; - -// private Set normalReadGroups; // we are going to remember which read groups are normals and which are tumors in order to be able -// private Set tumorReadGroups ; // to properly assign the reads coming from a merged stream - private Set normalSamples; // we are going to remember which samples are normal and which are tumor: - private Set tumorSamples ; // these are used only to generate genotypes for vcf output - - private int NQS_WIDTH = 5; // 5 bases on each side of the indel for NQS-style statistics - - private Writer bedWriter = null; - private Writer verboseWriter = null; - - - private static String annGenomic = "GENOMIC"; - private static String annIntron = "INTRON"; - private static String annUTR = "UTR"; - private static String annCoding = "CODING"; - private static String annUnknown = "UNKNOWN"; - - enum CallType { - NOCOVERAGE, - BADCOVERAGE, - NOEVIDENCE, - GERMLINE, - SOMATIC - }; - - private SAMRecord lastRead; - private byte[] refBases; - private ReferenceDataSource refData; - private Iterator genotypeIntervalIterator = null; - - // the current interval in the list of intervals, for which we want to do full genotyping - private GenomeLoc currentGenotypeInterval = null; - private long lastGenotypedPosition = -1; // last position on the currentGenotypeInterval, for which a call was already printed; - // can be 1 base before lastGenotyped start - - - // "/humgen/gsa-scr1/GATK_Data/refGene.sorted.txt" - - private Set getVCFHeaderInfo() { - Set headerInfo = new HashSet(); - - // first, the basic info - headerInfo.add(new VCFHeaderLine("source", "IndelGenotyperV2")); - headerInfo.add(new VCFHeaderLine("reference", getToolkit().getArguments().referenceFile.getName())); - - // FORMAT and INFO fields -// headerInfo.addAll(VCFUtils.getSupportedHeaderStrings()); - - headerInfo.addAll(VCFIndelAttributes.getAttributeHeaderLines()); - if ( call_somatic ) { - headerInfo.add(new VCFInfoHeaderLine(VCFConstants.SOMATIC_KEY, 0, VCFHeaderLineType.Flag, "Somatic event")); - } else { - } - - // all of the arguments from the argument collection - Set args = new HashSet(); - args.add(this); - args.addAll(getToolkit().getFilters()); - Map commandLineArgs = getToolkit().getApproximateCommandLineArguments(args); - for ( Map.Entry commandLineArg : commandLineArgs.entrySet() ) - headerInfo.add(new VCFHeaderLine(String.format("IGv2_%s", commandLineArg.getKey()), commandLineArg.getValue())); - // also, the list of input bams - for ( String fileName : getToolkit().getArguments().samFiles ) - headerInfo.add(new VCFHeaderLine("IGv2_bam_file_used", fileName)); - - return headerInfo; - } - - - @Override - public void initialize() { - - call_somatic = (call_unpaired ? false : true); - normal_context = new WindowContext(0,WINDOW_SIZE); - normalSamples = new HashSet(); - - if ( bedOutput != null && output_file != null ) { - throw new UserException.DeprecatedArgument("-O", "-O option is deprecated and -bed option replaces it; you can not use both at the same time"); - } - - if ( RefseqFileName != null ) { - logger.info("Using RefSeq annotations from "+RefseqFileName); - - RMDTrackBuilder builder = new RMDTrackBuilder(getToolkit().getReferenceDataSource().getReference().getSequenceDictionary(), - getToolkit().getGenomeLocParser(), - getToolkit().getArguments().unsafe); - RMDTrack refseq = builder.createInstanceOfTrack(RefSeqCodec.class,new File(RefseqFileName)); - - refseqIterator = new SeekableRODIterator(refseq.getHeader(), - refseq.getSequenceDictionary(), - getToolkit().getReferenceDataSource().getReference().getSequenceDictionary(), - getToolkit().getGenomeLocParser(), - refseq.getIterator()); - } - - if ( refseqIterator == null ) logger.info("No gene annotations available"); - - int nSams = getToolkit().getArguments().samFiles.size(); - - if ( call_somatic ) { - if ( nSams < 2 ) throw new UserException.BadInput("In default (paired sample) mode at least two bam files (normal and tumor) must be specified"); - tumor_context = new WindowContext(0,WINDOW_SIZE); - tumorSamples = new HashSet(); - } - - int nNorm = 0; - int nTum = 0; - for ( SAMReaderID rid : getToolkit().getReadsDataSource().getReaderIDs() ) { - Tags tags = rid.getTags() ; - if ( tags.getPositionalTags().isEmpty() && call_somatic ) - throw new UserException.BadInput("In default (paired sample) mode all input bam files must be tagged as either 'normal' or 'tumor'. Untagged file: "+ - getToolkit().getSourceFileForReaderID(rid)); - boolean normal = false; - boolean tumor = false; - for ( String s : tags.getPositionalTags() ) { // we allow additional unrelated tags (and we do not use them), but we REQUIRE one of Tumor/Normal to be present if --somatic is on - if ( "NORMAL".equals(s.toUpperCase()) ) { - normal = true; - nNorm++; - } - if ( "TUMOR".equals(s.toUpperCase()) ) { - tumor = true; - nTum++ ; - } - } - if ( call_somatic && normal && tumor ) throw new UserException.BadInput("Input bam file "+ - getToolkit().getSourceFileForReaderID(rid)+" is tagged both as normal and as tumor. Which one is it??"); - if ( call_somatic && !normal && ! tumor ) - throw new UserException.BadInput("In somatic mode all input bams must be tagged as either normal or tumor. Encountered untagged file: "+ - getToolkit().getSourceFileForReaderID(rid)); - if ( ! call_somatic && (normal || tumor) ) - System.out.println("WARNING: input bam file "+getToolkit().getSourceFileForReaderID(rid) - +" is tagged as Normal and/or Tumor, but somatic mode is not on. Tags will ne IGNORED"); - if ( call_somatic && tumor ) { - for ( SAMReadGroupRecord rg : getToolkit().getSAMFileHeader(rid).getReadGroups() ) { - tumorSamples.add(rg.getSample()); - } - } else { - for ( SAMReadGroupRecord rg : getToolkit().getSAMFileHeader(rid).getReadGroups() ) { - normalSamples.add(rg.getSample()); - } - } - if ( genotypeIntervalsFile != null ) { - - if ( ! GENOTYPE_NOT_SORTED && IntervalUtils.isIntervalFile(genotypeIntervalsFile)) { - // prepare to read intervals one-by-one, as needed (assuming they are sorted). - genotypeIntervalIterator = new IntervalFileMergingIterator(getToolkit().getGenomeLocParser(), - new java.io.File(genotypeIntervalsFile), IntervalMergingRule.OVERLAPPING_ONLY ); - } else { - // read in the whole list of intervals for cleaning - GenomeLocSortedSet locs = IntervalUtils.sortAndMergeIntervals(getToolkit().getGenomeLocParser(), - IntervalUtils.parseIntervalArguments(getToolkit().getGenomeLocParser(),Arrays.asList(genotypeIntervalsFile),true), IntervalMergingRule.OVERLAPPING_ONLY); - genotypeIntervalIterator = locs.iterator(); - } - - // wrap intervals requested for genotyping inside overlapping iterator, so that we actually - // genotype only on the intersections of the requested intervals with the -L intervals - genotypeIntervalIterator = new OverlappingIntervalIterator(genotypeIntervalIterator, getToolkit().getIntervals().iterator() ); - - currentGenotypeInterval = genotypeIntervalIterator.hasNext() ? genotypeIntervalIterator.next() : null; - - if ( DEBUG) System.out.println("DEBUG>> first genotyping interval="+currentGenotypeInterval); - - if ( currentGenotypeInterval != null ) lastGenotypedPosition = currentGenotypeInterval.getStart()-1; - } - - } - - location = getToolkit().getGenomeLocParser().createGenomeLoc(getToolkit().getSAMFileHeader().getSequence(0).getSequenceName(),1); - - normalSamples = getToolkit().getSamplesByReaders().get(0); - - try { - // we already checked that bedOutput and output_file are not set simultaneously - if ( bedOutput != null ) bedWriter = new FileWriter(bedOutput); - if ( output_file != null ) bedWriter = new FileWriter(output_file); - } catch (java.io.IOException e) { - throw new UserException.CouldNotReadInputFile(bedOutput, "Failed to open BED file for writing.", e); - } - try { - if ( verboseOutput != null ) verboseWriter = new FileWriter(verboseOutput); - } catch (java.io.IOException e) { - throw new UserException.CouldNotReadInputFile(verboseOutput, "Failed to open BED file for writing.", e); - } - - vcf_writer.writeHeader(new VCFHeader(getVCFHeaderInfo(), SampleUtils.getSAMFileSamples(getToolkit().getSAMFileHeader()))) ; - refData = new ReferenceDataSource(getToolkit().getArguments().referenceFile); - } - - - @Override - public Integer map(ReferenceContext ref, SAMRecord read, ReadMetaDataTracker metaDataTracker) { - - // if ( read.getReadName().equals("428EFAAXX090610:2:36:1384:639#0") ) System.out.println("GOT READ"); - - if ( DEBUG ) { - // System.out.println("DEBUG>> read at "+ read.getAlignmentStart()+"-"+read.getAlignmentEnd()+ - // "("+read.getCigarString()+")"); - if ( read.getDuplicateReadFlag() ) System.out.println("DEBUG>> Duplicated read (IGNORED)"); - } - - if ( AlignmentUtils.isReadUnmapped(read) || - read.getDuplicateReadFlag() || - read.getNotPrimaryAlignmentFlag() || - read.getMappingQuality() == 0 ) { - return 0; // we do not need those reads! - } - - if ( read.getReferenceIndex() != currentContigIndex ) { - // we just jumped onto a new contig - if ( DEBUG ) System.out.println("DEBUG>>> Moved to contig "+read.getReferenceName()); - if ( read.getReferenceIndex() < currentContigIndex ) // paranoidal - throw new UserException.MissortedBAM(SAMFileHeader.SortOrder.coordinate, read, "Read "+read.getReadName()+": contig is out of order; input BAM file is unsorted"); - - // print remaining indels from the previous contig (if any); - if ( call_somatic ) emit_somatic(1000000000, true); - else emit(1000000000,true); - - currentContigIndex = read.getReferenceIndex(); - currentPosition = read.getAlignmentStart(); - refName = new String(read.getReferenceName()); - - location = getToolkit().getGenomeLocParser().createGenomeLoc(refName,location.getStart(),location.getStop()); - contigLength = getToolkit().getGenomeLocParser().getContigInfo(refName).getSequenceLength(); - outOfContigUserWarned = false; - - lastGenotypedPosition = -1; - - normal_context.clear(); // reset coverage window; this will also set reference position to 0 - if ( call_somatic) tumor_context.clear(); - - refBases = new String(refData.getReference().getSequence(read.getReferenceName()).getBases()).toUpperCase().getBytes(); - } - - // we have reset the window to the new contig if it was required and emitted everything we collected - // on a previous contig. At this point we are guaranteed that we are set up properly for working - // with the contig of the current read. - - // NOTE: all the sanity checks and error messages below use normal_context only. We make sure that normal_context and - // tumor_context are synchronized exactly (windows are always shifted together by emit_somatic), so it's safe - - if ( read.getAlignmentStart() < currentPosition ) // oops, read out of order? - throw new UserException.MissortedBAM(SAMFileHeader.SortOrder.coordinate, read, "Read "+read.getReadName() +" out of order on the contig\n"+ - "Read starts at "+refName+":"+read.getAlignmentStart()+"; last read seen started at "+refName+":"+currentPosition - +"\nLast read was: "+lastRead.getReadName()+" RG="+lastRead.getAttribute("RG")+" at "+lastRead.getAlignmentStart()+"-" - +lastRead.getAlignmentEnd()+" cigar="+lastRead.getCigarString()); - - currentPosition = read.getAlignmentStart(); - lastRead = read; - - if ( read.getAlignmentEnd() > contigLength ) { - if ( ! outOfContigUserWarned ) { - System.out.println("WARNING: Reads aligned past contig length on "+ location.getContig()+"; all such reads will be skipped"); - outOfContigUserWarned = true; - } - return 0; - } - - long alignmentEnd = read.getAlignmentEnd(); - Cigar c = read.getCigar(); - int lastNonClippedElement = 0; // reverse offset to the last unclipped element - CigarOperator op = null; - // moving backwards from the end of the cigar, skip trailing S or H cigar elements: - do { - lastNonClippedElement++; - op = c.getCigarElement( c.numCigarElements()-lastNonClippedElement ).getOperator(); - } while ( op == CigarOperator.H || op == CigarOperator.S ); - - // now op is the last non-S/H operator in the cigar. - - // a little trick here: we want to make sure that current read completely fits into the current - // window so that we can accumulate indel observations over the whole length of the read. - // The ::getAlignmentEnd() method returns the last position on the reference where bases from the - // read actually match (M cigar elements). After our cleaning procedure, we can have reads that end - // with I element, which is not gonna be counted into alignment length on the reference. On the other hand, - // in this program we assign insertions, internally, to the first base *after* the insertion position. - // Hence, we have to make sure that that extra base is already in the window or we will get IndexOutOfBounds. - - if ( op == CigarOperator.I) alignmentEnd++; - - if ( alignmentEnd > normal_context.getStop()) { - - // we don't emit anything until we reach a read that does not fit into the current window. - // At that point we try shifting the window to the start of that read (or reasonably close) and emit everything prior to - // that position. This is legitimate, since the reads are sorted and we are not gonna see any more coverage at positions - // below the current read's start. - // Clearly, we assume here that window is large enough to accomodate any single read, so simply shifting - // the window to around the read's start will ensure that the read fits... - - if ( DEBUG) System.out.println("DEBUG>> Window at "+normal_context.getStart()+"-"+normal_context.getStop()+", read at "+ - read.getAlignmentStart()+": trying to emit and shift" ); - if ( call_somatic ) emit_somatic( read.getAlignmentStart(), false ); - else emit( read.getAlignmentStart(), false ); - - // let's double check now that the read fits after the shift - if ( read.getAlignmentEnd() > normal_context.getStop()) { - // ooops, looks like the read does not fit into the window even after the latter was shifted!! - throw new UserException.BadArgumentValue("window_size", "Read "+read.getReadName()+": out of coverage window bounds. Probably window is too small, so increase the value of the window_size argument.\n"+ - "Read length="+read.getReadLength()+"; cigar="+read.getCigarString()+"; start="+ - read.getAlignmentStart()+"; end="+read.getAlignmentEnd()+ - "; window start (after trying to accomodate the read)="+normal_context.getStart()+"; window end="+normal_context.getStop()); - } - } - - if ( call_somatic ) { - - Tags tags = getToolkit().getReaderIDForRead(read).getTags(); - boolean assigned = false; - for ( String s : tags.getPositionalTags() ) { - if ( "NORMAL".equals(s.toUpperCase()) ) { - normal_context.add(read,ref.getBases()); - assigned = true; - break; - } - if ( "TUMOR".equals(s.toUpperCase()) ) { - tumor_context.add(read,ref.getBases()); - assigned = true; - break; - } - } - if ( ! assigned ) - throw new StingException("Read "+read.getReadName()+" from "+getToolkit().getSourceFileForReaderID(getToolkit().getReaderIDForRead(read))+ - "has no Normal/Tumor tag associated with it"); - -// String rg = (String)read.getAttribute("RG"); -// if ( rg == null ) -// throw new UserException.MalformedBam(read, "Read "+read.getReadName()+" has no read group in merged stream. RG is required for somatic calls."); - -// if ( normalReadGroups.contains(rg) ) { -// normal_context.add(read,ref.getBases()); -// } else if ( tumorReadGroups.contains(rg) ) { -// tumor_context.add(read,ref.getBases()); -// } else { -// throw new UserException.MalformedBam(read, "Unrecognized read group in merged stream: "+rg); -// } - - if ( tumor_context.getReads().size() > MAX_READ_NUMBER ) { - System.out.println("WARNING: a count of "+MAX_READ_NUMBER+" reads reached in a window "+ - refName+':'+tumor_context.getStart()+'-'+tumor_context.getStop()+" in tumor sample. The whole window will be dropped."); - tumor_context.shift(WINDOW_SIZE); - normal_context.shift(WINDOW_SIZE); - } - if ( normal_context.getReads().size() > MAX_READ_NUMBER ) { - System.out.println("WARNING: a count of "+MAX_READ_NUMBER+" reads reached in a window "+ - refName+':'+normal_context.getStart()+'-'+normal_context.getStop()+" in normal sample. The whole window will be dropped"); - tumor_context.shift(WINDOW_SIZE); - normal_context.shift(WINDOW_SIZE); - } - - - } else { - normal_context.add(read, ref.getBases()); - if ( normal_context.getReads().size() > MAX_READ_NUMBER ) { - System.out.println("WARNING: a count of "+MAX_READ_NUMBER+" reads reached in a window "+ - refName+':'+normal_context.getStart()+'-'+normal_context.getStop()+". The whole window will be dropped"); - normal_context.shift(WINDOW_SIZE); - } - } - - return 1; - } - - /** An auxiliary shortcut: returns true if position(location.getContig(), p) is past l */ - private boolean pastInterval(long p, GenomeLoc l) { - return ( location.getContigIndex() > l.getContigIndex() || - location.getContigIndex() == l.getContigIndex() && p > l.getStop() ); - } - - /** Emit calls of the specified type across genotyping intervals, from position lastGenotypedPosition+1 to - * pos-1, inclusive. - * @param contigIndex - * @param pos - * @param call - */ - /* - private void emitNoCallsUpTo(int contigIndex, long pos, CallType call) { - - if ( contigIndex < currentGenotypeInterval.getContigIndex() || - contigIndex == currentGenotypeInterval.getContigIndex() && pos <= currentGenotypeInterval.getStart() ) return; - - if ( contigIndex == currentGenotypeInterval.getContigIndex() && pos >= currentGenotypeInterval.getStart() ) { - for ( long p = lastGenotypedPosition+1; p < pos; p++ ) { - - } - } - while( currentGenotypeInterval != null ) { - - while ( ) - if ( genotypeIntervalIterator.hasNext() ) { - currentGenotypeInterval = genotypeIntervalIterator.next() ; - if ( pastInterval(p,currentGenotypeInterval) ) { - // if we are about to jump over the whole next interval, we need to emit NO_COVERAGE calls there! - emitNoCoverageCalls(currentGenotypeInterval); - } - } else { - currentGenotypeInterval = null; - } - } - } -*/ - - /** Output indel calls up to the specified position and shift the window: after this method is executed, the - * first element of the window maps onto 'position', if possible, or at worst a few bases to the left of 'position' if we may need more - * reads to get full NQS-style statistics for an indel in the close proximity of 'position'. - * - * @param position - */ - private void emit(long position, boolean force) { - - long adjustedPosition = adjustPosition(position); - - if ( adjustedPosition == -1 ) { - // failed to find appropriate shift position, the data are probably to messy anyway so we drop them altogether - normal_context.shift((int)(position-normal_context.getStart())); - return; - } - long move_to = adjustedPosition; - - for ( int pos = normal_context.getStart() ; pos < Math.min(adjustedPosition,normal_context.getStop()+1) ; pos++ ) { - - boolean genotype = false; - // first let's see if we need to genotype current position: - - final long p = pos - 1; // our internally used positions (pos) are +1 compared to external format spec (e.g. vcf) - - if ( pos <= lastGenotypedPosition ) continue; - - while ( currentGenotypeInterval != null ) { - - // if we did not even reach next interval yet, no genotyping at current position: - if ( location.getContigIndex() < currentGenotypeInterval.getContigIndex() || - location.getContigIndex() == currentGenotypeInterval.getContigIndex() && - p < currentGenotypeInterval.getStart() ) break; - if ( pastInterval(p, currentGenotypeInterval) ) { - // we are past current genotyping interval, so we are done with it; let's load next interval: - currentGenotypeInterval = genotypeIntervalIterator.hasNext() ? genotypeIntervalIterator.next() : null; - continue; // re-enter the loop to check against the interval we just loaded - } - - // we reach this point only if p is inside current genotyping interval; set the flag and bail out: - genotype = true; - break; - } - -// if ( DEBUG ) System.out.println("DEBUG>> pos="+pos +"; genotyping interval="+currentGenotypeInterval+"; genotype="+genotype); - - if ( normal_context.indelsAt(pos).size() == 0 && ! genotype ) continue; - - IndelPrecall normalCall = new IndelPrecall(normal_context,pos,NQS_WIDTH); - - if ( normalCall.getCoverage() < minCoverage && ! genotype ) { - if ( DEBUG ) { - System.out.println("DEBUG>> Indel at "+pos+"; coverare in normal="+normalCall.getCoverage()+" (SKIPPED)"); - } - continue; // low coverage - } - - if ( DEBUG ) System.out.println("DEBUG>> "+(normalCall.getAllVariantCount() == 0?"No Indel":"Indel")+" at "+pos); - - long left = Math.max( pos-NQS_WIDTH, normal_context.getStart() ); - long right = pos+( normalCall.getVariant() == null ? 0 : normalCall.getVariant().lengthOnRef())+NQS_WIDTH-1; - - if ( right >= adjustedPosition && ! force) { - // we are not asked to force-shift, and there is more coverage around the current indel that we still need to collect - - // we are not asked to force-shift, and there's still additional coverage to the right of current indel, so its too early to emit it; - // instead we shift only up to current indel pos - MISMATCH_WIDTH, so that we could keep collecting that coverage - move_to = adjustPosition(left); - if ( move_to == -1 ) { - // failed to find appropriate shift position, the data are probably to messy anyway so we drop them altogether - normal_context.shift((int)(adjustedPosition-normal_context.getStart())); - return; - } - if ( DEBUG ) System.out.println("DEBUG>> waiting for coverage; actual shift performed to "+ move_to); - break; - } - - // if indel is too close to the end of the window but we need to emit anyway (force-shift), adjust right: - if ( right > normal_context.getStop() ) right = normal_context.getStop(); - - // location = getToolkit().getGenomeLocParser().setStart(location,pos); - // location = getToolkit().getGenomeLocParser().setStop(location,pos); // retrieve annotation data - - location = getToolkit().getGenomeLocParser().createGenomeLoc(location.getContig(), pos); - - boolean haveCall = normalCall.isCall(); // cache the value - - if ( haveCall || genotype) { - if ( haveCall ) normalCallsMade++; - printVCFLine(vcf_writer,normalCall); - if ( bedWriter != null ) normalCall.printBedLine(bedWriter); - if ( verboseWriter != null ) printVerboseLine(verboseWriter, normalCall); - lastGenotypedPosition = pos; - } - - normal_context.indelsAt(pos).clear(); - // we dealt with this indel; don't want to see it again - // (we might otherwise in the case when 1) there is another indel that follows - // within MISMATCH_WIDTH bases and 2) we'd need to wait for more coverage for that next indel) - -// for ( IndelVariant var : variants ) { -// System.out.print("\t"+var.getType()+"\t"+var.getBases()+"\t"+var.getCount()); -// } - } - - if ( DEBUG ) System.out.println("DEBUG>> Actual shift to " + move_to + " ("+adjustedPosition+")"); - normal_context.shift((int)(move_to - normal_context.getStart() ) ); - } - - /** A shortcut. Returns true if we got indels within the specified interval in single and only window context - * (for single-sample calls) or in either of the two window contexts (for two-sample/somatic calls) - * - */ - private boolean indelsPresentInInterval(long start, long stop) { - if ( tumor_context == null ) return normal_context.hasIndelsInInterval(start,stop); - return tumor_context.hasIndelsInInterval(start,stop) || - normal_context.hasIndelsInInterval(start,stop); - } - /** Takes the position, to which window shift is requested, and tries to adjust it in such a way that no NQS window is broken. - * Namely, this method checks, iteratively, if there is an indel within NQS_WIDTH bases ahead of initially requested or adjusted - * shift position. If there is such an indel, - * then shifting to that position would lose some or all NQS-window bases to the left of the indel (since it's not going to be emitted - * just yet). Instead, this method tries to readjust the shift position leftwards so that full NQS window to the left of the next indel - * is preserved. This method tries thie strategy 4 times (so that it would never walk away too far to the left), and if it fails to find - * an appropriate adjusted shift position (which could happen if there are many indels following each other at short intervals), it will give up, - * go back to the original requested shift position and try finding the first shift poisition that has no indel associated with it. - */ - - private long adjustPosition(long request) { - long initial_request = request; - int attempts = 0; - boolean failure = false; - while ( indelsPresentInInterval(request,request+NQS_WIDTH) ) { - request -= NQS_WIDTH; - if ( DEBUG ) System.out.println("DEBUG>> indel observations present within "+NQS_WIDTH+" bases ahead. Resetting shift to "+request); - attempts++; - if ( attempts == 4 ) { - if ( DEBUG ) System.out.println("DEBUG>> attempts to preserve full NQS window failed; now trying to find any suitable position.") ; - failure = true; - break; - } - } - - if ( failure ) { - // we tried 4 times but did not find a good shift position that would preserve full nqs window - // around all indels. let's fall back and find any shift position as long and there's no indel at the very - // first position after the shift (this is bad for other reasons); if it breaks a nqs window, so be it - request = initial_request; - attempts = 0; - while ( indelsPresentInInterval(request,request+1) ) { - request--; - if ( DEBUG ) System.out.println("DEBUG>> indel observations present within "+NQS_WIDTH+" bases ahead. Resetting shift to "+request); - attempts++; - if ( attempts == 50 ) { - System.out.println("WARNING: Indel at every position in the interval "+refName+":"+request+"-"+initial_request+ - ". Can not find a break to shift context window to; no calls will be attempted in the current window."); - return -1; - } - } - } - if ( DEBUG ) System.out.println("DEBUG>> Found acceptable target position "+request); - return request; - } - - /** Output somatic indel calls up to the specified position and shift the coverage array(s): after this method is executed - * first elements of the coverage arrays map onto 'position', or a few bases prior to the specified position - * if there is an indel in close proximity to 'position' so that we may get more coverage around it later. - * - * @param position - */ - private void emit_somatic(long position, boolean force) { - - long adjustedPosition = adjustPosition(position); - if ( adjustedPosition == -1 ) { - // failed to find appropriate shift position, the data are probably to messy anyway so we drop them altogether - normal_context.shift((int)(position-normal_context.getStart())); - tumor_context.shift((int)(position-tumor_context.getStart())); - return; - } - long move_to = adjustedPosition; - - if ( DEBUG ) System.out.println("DEBUG>> Emitting in somatic mode up to "+position+" force shift="+force+" current window="+tumor_context.getStart()+"-"+tumor_context.getStop()); - - for ( int pos = tumor_context.getStart() ; pos < Math.min(adjustedPosition,tumor_context.getStop()+1) ; pos++ ) { - - boolean genotype = false; - // first let's see if we need to genotype current position: - - final long p = pos - 1; // our internally used positions (pos) are +1 compared to external format spec (e.g. vcf) - - if ( pos <= lastGenotypedPosition ) continue; - - while ( currentGenotypeInterval != null ) { - - // if we did not even reach next interval yet, no genotyping at current position: - if ( location.getContigIndex() < currentGenotypeInterval.getContigIndex() || - location.getContigIndex() == currentGenotypeInterval.getContigIndex() && - p < currentGenotypeInterval.getStart() ) break; - if ( pastInterval(p, currentGenotypeInterval) ) { - // we are past current genotyping interval, so we are done with it; let's load next interval: - currentGenotypeInterval = genotypeIntervalIterator.hasNext() ? genotypeIntervalIterator.next() : null; - continue; // re-enter the loop to check against the interval we just loaded - } - - // we reach tjis point only if p is inside current genotyping interval; set the flag and bail out: - genotype = true; - break; - } -// if ( DEBUG) System.out.println("DEBUG>> pos="+pos +"; genotyping interval="+currentGenotypeInterval+"; genotype="+genotype); - - if ( tumor_context.indelsAt(pos).size() == 0 && ! genotype ) continue; // no indels in tumor - - if ( DEBUG && genotype ) System.out.println("DEBUG>> Genotyping requested at "+pos); - - IndelPrecall tumorCall = new IndelPrecall(tumor_context,pos,NQS_WIDTH); - IndelPrecall normalCall = new IndelPrecall(normal_context,pos,NQS_WIDTH); - - if ( tumorCall.getCoverage() < minCoverage && ! genotype ) { - if ( DEBUG ) { - System.out.println("DEBUG>> Indel in tumor at "+pos+"; coverare in tumor="+tumorCall.getCoverage()+" (SKIPPED)"); - } - continue; // low coverage - } - if ( normalCall.getCoverage() < minNormalCoverage && ! genotype ) { - if ( DEBUG ) { - System.out.println("DEBUG>> Indel in tumor at "+pos+"; coverare in normal="+normalCall.getCoverage()+" (SKIPPED)"); - } - continue; // low coverage - } - - if ( DEBUG ) { - System.out.print("DEBUG>> "+(tumorCall.getAllVariantCount() == 0?"No Indel":"Indel")+" in tumor, "); - System.out.print("DEBUG>> "+(normalCall.getAllVariantCount() == 0?"No Indel":"Indel")+" in normal at "+pos); - } - - long left = Math.max( pos-NQS_WIDTH, tumor_context.getStart() ); - long right = pos+ ( tumorCall.getVariant() == null ? 0 : tumorCall.getVariant().lengthOnRef() )+NQS_WIDTH-1; - - if ( right >= adjustedPosition && ! force) { - // we are not asked to force-shift, and there is more coverage around the current indel that we still need to collect - - // we are not asked to force-shift, and there's still additional coverage to the right of current indel, so its too early to emit it; - // instead we shift only up to current indel pos - MISMATCH_WIDTH, so that we could keep collecting that coverage - move_to = adjustPosition(left); - if ( move_to == -1 ) { - // failed to find appropriate shift position, the data are probably to messy anyway so we drop them altogether - normal_context.shift((int)(adjustedPosition-normal_context.getStart())); - tumor_context.shift((int)(adjustedPosition-tumor_context.getStart())); - return; - } - if ( DEBUG ) System.out.println("DEBUG>> waiting for coverage; actual shift performed to "+ move_to); - break; - } - - if ( right > tumor_context.getStop() ) right = tumor_context.getStop(); // if indel is too close to the end of the window but we need to emit anyway (force-shift), adjust right - -// location = getToolkit().getGenomeLocParser().setStart(location,pos); -// location = getToolkit().getGenomeLocParser().setStop(location,pos); // retrieve annotation data - - location = getToolkit().getGenomeLocParser().createGenomeLoc(location.getContig(),pos); // retrieve annotation data - - boolean haveCall = tumorCall.isCall(); // cache the value - - if ( haveCall || genotype ) { - if ( haveCall ) tumorCallsMade++; - - printVCFLine(vcf_writer,normalCall,tumorCall); - - if ( bedWriter != null ) tumorCall.printBedLine(bedWriter); - - if ( verboseWriter != null ) printVerboseLine(verboseWriter, normalCall, tumorCall ); - lastGenotypedPosition = pos; - } - tumor_context.indelsAt(pos).clear(); - normal_context.indelsAt(pos).clear(); - // we dealt with this indel; don't want to see it again - // (we might otherwise in the case when 1) there is another indel that follows - // within MISMATCH_WIDTH bases and 2) we'd need to wait for more coverage for that next indel) - -// for ( IndelVariant var : variants ) { -// System.out.print("\t"+var.getType()+"\t"+var.getBases()+"\t"+var.getCount()); -// } - } - - if ( DEBUG ) System.out.println("DEBUG>> Actual shift to " + move_to + " ("+adjustedPosition+")"); - tumor_context.shift((int)(move_to - tumor_context.getStart() ) ); - normal_context.shift((int)(move_to - normal_context.getStart() ) ); - } - - private String makeFullRecord(IndelPrecall normalCall, IndelPrecall tumorCall) { - StringBuilder fullRecord = new StringBuilder(); - if ( tumorCall.getVariant() != null || normalCall.getVariant() == null) { - fullRecord.append(tumorCall.makeEventString()); - } else { - fullRecord.append(normalCall.makeEventString()); - } - fullRecord.append('\t'); - fullRecord.append(normalCall.makeStatsString("N_")); - fullRecord.append('\t'); - fullRecord.append(tumorCall.makeStatsString("T_")); - fullRecord.append('\t'); - return fullRecord.toString(); - } - - private String makeFullRecord(IndelPrecall normalCall) { - StringBuilder fullRecord = new StringBuilder(); - fullRecord.append(normalCall.makeEventString()); - fullRecord.append('\t'); - fullRecord.append(normalCall.makeStatsString("")); - fullRecord.append('\t'); - return fullRecord.toString(); - } - - private String getAnnotationString(RODRecordList ann) { - if ( ann == null ) return annGenomic; - else { - StringBuilder b = new StringBuilder(); - - if ( RefSeqFeature.isExon(ann) ) { - if ( RefSeqFeature.isCodingExon(ann) ) b.append(annCoding); // both exon and coding = coding exon sequence - else b.append(annUTR); // exon but not coding = UTR - } else { - if ( RefSeqFeature.isCoding(ann) ) b.append(annIntron); // not in exon, but within the coding region = intron - else b.append(annUnknown); // we have no idea what this is. this may actually happen when we have a fully non-coding exon... - } - b.append('\t'); - b.append(((Transcript)ann.get(0).getUnderlyingObject()).getGeneName()); // there is at least one transcript in the list, guaranteed -// while ( it.hasNext() ) { // -// t.getGeneName() -// } - return b.toString(); - } - - } - - public void printVerboseLine(Writer verboseWriter, IndelPrecall normalCall) { - RODRecordList annotationList = (refseqIterator == null ? null : refseqIterator.seekForward(location)); - String annotationString = (refseqIterator == null ? "" : getAnnotationString(annotationList)); - - StringBuilder fullRecord = new StringBuilder(); - fullRecord.append(makeFullRecord(normalCall)); - fullRecord.append(annotationString); - if ( ! normalCall.isCall() && normalCall.getVariant() != null ) fullRecord.append("\tFILTERED_NOCALL"); - try { - verboseWriter.write(fullRecord.toString()); - verboseWriter.write('\n'); - } catch (IOException e) { - throw new UserException.CouldNotCreateOutputFile(verboseOutput, "Write failed", e); - } - - } - - - public void printVerboseLine(Writer verboseWriter, IndelPrecall normalCall, IndelPrecall tumorCall) { - RODRecordList annotationList = (refseqIterator == null ? null : refseqIterator.seekForward(location)); - String annotationString = (refseqIterator == null ? "" : getAnnotationString(annotationList)); - - StringBuilder fullRecord = new StringBuilder(); - fullRecord.append(makeFullRecord(normalCall,tumorCall)); - - if ( normalCall.getVariant() == null && tumorCall.getVariant() == null ) { - // did not observe anything - if ( normalCall.getCoverage() >= minNormalCoverage && tumorCall.getCoverage() >= minCoverage ) fullRecord.append("REFERENCE"); - else { - if ( tumorCall.getCoverage() >= minCoverage ) fullRecord.append("REFERENCE"); // no coverage in normal but nothing in tumor - else { - // no coverage in tumor; if we have no coverage in normal, it can be anything; if we do have coverage in normal, - // this still could be a somatic event. so either way it is 'unknown' - fullRecord.append("UNKNOWN"); - } - } - - } - - if ( normalCall.getVariant() == null && tumorCall.getVariant() != null ) { - // looks like somatic call - if ( normalCall.getCoverage() >= minNormalCoverage ) fullRecord.append("SOMATIC"); // we confirm there is nothing in normal - else { - // low coverage in normal - fullRecord.append("EVENT_T"); // no coverage in normal, no idea whether it is germline or somatic - } - } - - if ( normalCall.getVariant() != null && tumorCall.getVariant() == null ) { - // it's likely germline (with missing observation in tumor - maybe loh? - if ( tumorCall.getCoverage() >= minCoverage ) fullRecord.append("GERMLINE_LOH"); // we confirm there is nothing in tumor - else { - // low coverage in tumor, maybe we missed the event - fullRecord.append("GERMLINE"); // no coverage in tumor but we already saw it in normal... - } - } - - if ( normalCall.getVariant() != null && tumorCall.getVariant() != null ) { - // events in both T/N, got to be germline! - fullRecord.append("GERMLINE"); - } - - - fullRecord.append('\t'); - fullRecord.append(annotationString); - - if ( ! tumorCall.isCall() && tumorCall.getVariant() != null ) fullRecord.append("\tFILTERED_NOCALL"); - - try { - verboseWriter.write(fullRecord.toString()); - verboseWriter.write('\n'); - } catch (IOException e) { - throw new UserException.CouldNotCreateOutputFile(verboseOutput, "Write failed", e); - } - } - - public void printVCFLine(VCFWriter vcf, IndelPrecall call) { - - long start = call.getPosition()-1; - // If the beginning of the chromosome is deleted (possible, however unlikely), it's unclear how to proceed. - // The suggestion is instead of putting the base before the indel, to put the base after the indel. - // For now, just don't print out that site. - if ( start == 0 ) - return; - - long stop = start; - - List alleles = new ArrayList(2); // actual observed (distinct!) alleles at the site - List homref_alleles = null; // when needed, will contain two identical copies of ref allele - needed to generate hom-ref genotype - - - if ( call.getVariant() == null ) { - // we will need to cteate genotype with two (hom) ref alleles (below). - // we can not use 'alleles' list here, since that list is supposed to contain - // only *distinct* alleles observed at the site or VCFContext will frown upon us... - alleles.add( Allele.create(refBases[(int)start-1],true) ); - homref_alleles = new ArrayList(2); - homref_alleles.add( alleles.get(0)); - homref_alleles.add( alleles.get(0)); - } else { - // we always create alt allele when we observe anything but the ref, even if it is not a call! - // (Genotype will tell us whether it is an actual call or not!) - int event_length = call.getVariant().lengthOnRef(); - if ( event_length < 0 ) event_length = 0; - fillAlleleList(alleles,call); - stop += event_length; - } - - Map genotypes = new HashMap(); - - for ( String sample : normalSamples ) { - - Map attrs = call.makeStatsAttributes(null); - - if ( call.isCall() ) // we made a call - put actual het genotype here: - genotypes.put(sample,new Genotype(sample,alleles,Genotype.NO_NEG_LOG_10PERROR,null,attrs,false)); - else // no call: genotype is ref/ref (but alleles still contain the alt if we observed anything at all) - genotypes.put(sample,new Genotype(sample, homref_alleles,Genotype.NO_NEG_LOG_10PERROR,null,attrs,false)); - - } - Set filters = null; - if ( call.getVariant() != null && ! call.isCall() ) { - filters = new HashSet(); - filters.add("NoCall"); - } - VariantContext vc = new VariantContext("IGv2_Indel_call", refName, start, stop, alleles, genotypes, - -1.0 /* log error */, filters, null); - vcf.add(vc,refBases[(int)start-1]); - } - - /** Fills l with appropriate alleles depending on whether call is insertion or deletion - * (l MUST have a variant or this method will crash). It is guaranteed that the *first* allele added - * to the list is ref, and the next one is alt. - * @param l - * @param call - */ - private void fillAlleleList(List l, IndelPrecall call) { - int event_length = call.getVariant().lengthOnRef(); - if ( event_length == 0 ) { // insertion - - l.add( Allele.create(Allele.NULL_ALLELE_STRING,true) ); - l.add( Allele.create(call.getVariant().getBases(), false )); - - } else { //deletion: - l.add( Allele.create(call.getVariant().getBases(), true )); - l.add( Allele.create(Allele.NULL_ALLELE_STRING,false) ); - } - } - - public void printVCFLine(VCFWriter vcf, IndelPrecall nCall, IndelPrecall tCall) { - - long start = tCall.getPosition()-1; - long stop = start; - - // If the beginning of the chromosome is deleted (possible, however unlikely), it's unclear how to proceed. - // The suggestion is instead of putting the base before the indel, to put the base after the indel. - // For now, just don't print out that site. - if ( start == 0 ) - return; - - Map attrsNormal = nCall.makeStatsAttributes(null); - Map attrsTumor = tCall.makeStatsAttributes(null); - - Map attrs = new HashMap(); - - boolean isSomatic = false; - if ( nCall.getCoverage() >= minNormalCoverage && nCall.getVariant() == null && tCall.getVariant() != null ) { - isSomatic = true; - attrs.put(VCFConstants.SOMATIC_KEY,true); - } - List alleles = new ArrayList(2); // all alleles at the site - // List normal_alleles = null; // all alleles at the site - List homRefAlleles = null; - -// if ( nCall.getVariant() == null || tCall.getVariant() == null ) { - homRefAlleles = new ArrayList(2) ; // we need this for somatic calls (since normal is ref-ref), and also for no-calls -// } - boolean homRefT = ( tCall.getVariant() == null ); - boolean homRefN = ( nCall.getVariant() == null ); - if ( tCall.getVariant() == null && nCall.getVariant() == null) { - // no indel at all ; create base-representation ref/ref alleles for genotype construction - alleles.add( Allele.create(refBases[(int)start-1],true) ); - } else { - // we got indel(s) - int event_length = 0; - if ( tCall.getVariant() != null ) { - // indel in tumor - event_length = tCall.getVariant().lengthOnRef(); - fillAlleleList(alleles, tCall); - } else { - event_length = nCall.getVariant().lengthOnRef(); - fillAlleleList(alleles, nCall); - } - if ( event_length > 0 ) stop += event_length; - } - homRefAlleles.add( alleles.get(0)); - homRefAlleles.add( alleles.get(0)); - - Map genotypes = new HashMap(); - - for ( String sample : normalSamples ) { - genotypes.put(sample,new Genotype(sample, homRefN ? homRefAlleles : alleles,Genotype.NO_NEG_LOG_10PERROR,null,attrsNormal,false)); - } - - for ( String sample : tumorSamples ) { - genotypes.put(sample,new Genotype(sample, homRefT ? homRefAlleles : alleles,Genotype.NO_NEG_LOG_10PERROR,null,attrsTumor,false) ); - } - - Set filters = null; - if ( tCall.getVariant() != null && ! tCall.isCall() ) { - filters = new HashSet(); - filters.add("NoCall"); - } - if ( nCall.getCoverage() < minNormalCoverage ) { - if ( filters == null ) filters = new HashSet(); - filters.add("NCov"); - } - if ( tCall.getCoverage() < minCoverage ) { - if ( filters == null ) filters = new HashSet(); - filters.add("TCov"); - } - - VariantContext vc = new VariantContext("IGv2_Indel_call", refName, start, stop, alleles, genotypes, - -1.0 /* log error */, filters, attrs); - vcf.add(vc,refBases[(int)start-1]); - } - - @Override - public void onTraversalDone(Integer result) { - if ( DEBUG ) { - System.out.println("DEBUG>> Emitting last window at "+normal_context.getStart()+"-"+normal_context.getStop()); - } - if ( call_somatic ) emit_somatic(1000000000, true); - else emit(1000000000,true); // emit everything we might have left - - if ( metricsWriter != null ) { - metricsWriter.println(String.format("Normal calls made %d", normalCallsMade)); - metricsWriter.println(String.format("Tumor calls made %d", tumorCallsMade)); - metricsWriter.close(); - } - - try { - if ( bedWriter != null ) bedWriter.close(); - if ( verboseWriter != null ) verboseWriter.close(); - } catch (IOException e) { - System.out.println("Failed to close output BED file gracefully, data may be lost"); - e.printStackTrace(); - } - super.onTraversalDone(result); - } - - @Override - public Integer reduce(Integer value, Integer sum) { - if ( value == -1 ) { - onTraversalDone(sum); - System.exit(1); - } - sum += value; - return sum; - } - - @Override - public Integer reduceInit() { - return new Integer(0); - } - - - static class IndelVariant { - public static enum Type { I, D}; - private String bases; - private Type type; - private ArrayList fromStartOffsets = null; - private ArrayList fromEndOffsets = null; - - private Set reads = new HashSet(); // keep track of reads that have this indel - private Set samples = new HashSet(); // which samples had the indel described by this object - - public IndelVariant(ExpandedSAMRecord read , Type type, String bases) { - this.type = type; - this.bases = bases.toUpperCase(); - addObservation(read); - fromStartOffsets = new ArrayList(); - fromEndOffsets = new ArrayList(); - } - - /** Adds another observation for the current indel. It is assumed that the read being registered - * does contain the observation, no checks are performed. Read's sample is added to the list of samples - * this indel was observed in as well. - * @param read - */ - public void addObservation(ExpandedSAMRecord read) { - if ( reads.contains(read) ) { - //TODO fix CleanedReadInjector and reinstate exception here: duplicate records may signal a problem with the bam - // seeing the same read again can mean only one thing: the input bam file is corrupted and contains - // duplicate records. We KNOW that this may happen for the time being due to bug in CleanedReadInjector - // so this is a short-term patch: don't cry, but just ignore the duplicate record - - //throw new StingException("Attempting to add indel observation that was already registered"); - return; - } - reads.add(read); - String sample = null; - if ( read.getSAMRecord().getReadGroup() != null ) sample = read.getSAMRecord().getReadGroup().getSample(); - if ( sample != null ) samples.add(sample); - } - - - /** Returns length of the event on the reference (number of deleted bases - * for deletions, -1 for insertions. - * @return - */ - public int lengthOnRef() { - if ( type == Type.D ) return bases.length(); - else return 0; - } - - - public void addSample(String sample) { - if ( sample != null ) - samples.add(sample); - } - - public void addReadPositions(int fromStart, int fromEnd) { - fromStartOffsets.add(fromStart); - fromEndOffsets.add(fromEnd); - } - - public List getOffsetsFromStart() { return fromStartOffsets ; } - public List getOffsetsFromEnd() { return fromEndOffsets; } - - public String getSamples() { - StringBuffer sb = new StringBuffer(); - Iterator i = samples.iterator(); - while ( i.hasNext() ) { - sb.append(i.next()); - if ( i.hasNext() ) - sb.append(","); - } - return sb.toString(); - } - - public Set getReadSet() { return reads; } - - public int getCount() { return reads.size(); } - - public String getBases() { return bases; } - - public Type getType() { return type; } - - @Override - public boolean equals(Object o) { - if ( ! ( o instanceof IndelVariant ) ) return false; - IndelVariant that = (IndelVariant)o; - return ( this.type == that.type && this.bases.equals(that.bases) ); - } - - public boolean equals(Type type, String bases) { - return ( this.type == type && this.bases.equals(bases.toUpperCase()) ); - } - } - - /** - * Utility class that encapsulates the logic related to collecting all the stats and counts required to - * make (or discard) a call, as well as the calling heuristics that uses those data. - */ - class IndelPrecall { -// private boolean DEBUG = false; - private int NQS_MISMATCH_CUTOFF = 1000000; - private double AV_MISMATCHES_PER_READ = 1.5; - - private int nqs = 0; - private IndelVariant consensus_indel = null; // indel we are going to call - private long pos = -1 ; // position on the ref - private int total_coverage = 0; // total number of reads overlapping with the event - private int consensus_indel_count = 0; // number of reads, in which consensus indel was observed - private int all_indel_count = 0 ; // number of reads, in which any indel was observed at current position - - private int total_mismatches_in_nqs_window = 0; // total number of mismatches in the nqs window around the indel - private int total_bases_in_nqs_window = 0; // total number of bases in the nqs window (some reads may not fully span the window so it's not coverage*nqs_size) - private int total_base_qual_in_nqs_window = 0; // sum of qualitites of all the bases in the nqs window - private int total_mismatching_base_qual_in_nqs_window = 0; // sum of qualitites of all mismatching bases in the nqs window - - private int indel_read_mismatches_in_nqs_window = 0; // mismatches inside the nqs window in indel-containing reads only - private int indel_read_bases_in_nqs_window = 0; // number of bases in the nqs window from indel-containing reads only - private int indel_read_base_qual_in_nqs_window = 0; // sum of qualitites of bases in nqs window from indel-containing reads only - private int indel_read_mismatching_base_qual_in_nqs_window = 0; // sum of qualitites of mismatching bases in the nqs window from indel-containing reads only - - - private int consensus_indel_read_mismatches_in_nqs_window = 0; // mismatches within the nqs window from consensus indel reads only - private int consensus_indel_read_bases_in_nqs_window = 0; // number of bases in the nqs window from consensus indel-containing reads only - private int consensus_indel_read_base_qual_in_nqs_window = 0; // sum of qualitites of bases in nqs window from consensus indel-containing reads only - private int consensus_indel_read_mismatching_base_qual_in_nqs_window = 0; // sum of qualitites of mismatching bases in the nqs window from consensus indel-containing reads only - - - private double consensus_indel_read_total_mm = 0.0; // sum of all mismatches in reads that contain consensus indel - private double all_indel_read_total_mm = 0.0; // sum of all mismatches in reads that contain any indel at given position - private double all_read_total_mm = 0.0; // sum of all mismatches in all reads - - private double consensus_indel_read_total_mapq = 0.0; // sum of mapping qualitites of all reads with consensus indel - private double all_indel_read_total_mapq = 0.0 ; // sum of mapping qualitites of all reads with (any) indel at current position - private double all_read_total_mapq = 0.0; // sum of all mapping qualities of all reads - - private PrimitivePair.Int consensus_indel_read_orientation_cnt = new PrimitivePair.Int(); - private PrimitivePair.Int all_indel_read_orientation_cnt = new PrimitivePair.Int(); - private PrimitivePair.Int all_read_orientation_cnt = new PrimitivePair.Int(); - - private int from_start_median = 0; - private int from_start_mad = 0; - private int from_end_median = 0; - private int from_end_mad = 0; - - /** Makes an empty call (no-call) with all stats set to 0 - * - * @param position - */ - public IndelPrecall(long position) { - this.pos = position; - } - - public IndelPrecall(WindowContext context, long position, int nqs_width) { - this.pos = position; - this.nqs = nqs_width; - total_coverage = context.coverageAt(pos,true); - List variants = context.indelsAt(pos); - findConsensus(variants); - - // pos is the first base after the event: first deleted base or first base after insertion. - // hence, [pos-nqs, pos+nqs-1] (inclusive) is the window with nqs bases on each side of a no-event or an insertion - // and [pos-nqs, pos+Ndeleted+nqs-1] is the window with nqs bases on each side of a deletion. - // we initialize the nqs window for no-event/insertion case - long left = Math.max( pos-nqs, context.getStart() ); - long right = Math.min(pos+nqs-1, context.getStop()); -//if ( pos == 3534096 ) System.out.println("pos="+pos +" total reads: "+context.getReads().size()); - Iterator read_iter = context.getReads().iterator(); - - - while ( read_iter.hasNext() ) { - ExpandedSAMRecord rec = read_iter.next(); - SAMRecord read = rec.getSAMRecord(); - byte[] flags = rec.getExpandedMMFlags(); - byte[] quals = rec.getExpandedQuals(); - int mm = rec.getMMCount(); - - - if( read.getAlignmentStart() > pos || read.getAlignmentEnd() < pos ) continue; - - long local_right = right; // end of nqs window for this particular read. May need to be advanced further right - // if read has a deletion. The gap in the middle of nqs window will be skipped - // automatically since flags/quals are set to -1 there - - boolean read_has_a_variant = false; - boolean read_has_consensus = ( consensus_indel!= null && consensus_indel.getReadSet().contains(rec) ); - for ( IndelVariant v : variants ) { - if ( v.getReadSet().contains(rec) ) { - read_has_a_variant = true; - local_right += v.lengthOnRef(); - break; - } - } - - if ( read_has_consensus ) { - consensus_indel_read_total_mm += mm; - consensus_indel_read_total_mapq += read.getMappingQuality(); - if ( read.getReadNegativeStrandFlag() ) consensus_indel_read_orientation_cnt.second++; - else consensus_indel_read_orientation_cnt.first++; - } - if ( read_has_a_variant ) { - all_indel_read_total_mm += mm; - all_indel_read_total_mapq += read.getMappingQuality(); - if ( read.getReadNegativeStrandFlag() ) all_indel_read_orientation_cnt.second++; - else all_indel_read_orientation_cnt.first++; - } - - all_read_total_mm+= mm; - all_read_total_mapq += read.getMappingQuality(); - if ( read.getReadNegativeStrandFlag() ) all_read_orientation_cnt.second++; - else all_read_orientation_cnt.first++; - - for ( int pos_in_flags = Math.max((int)(left - read.getAlignmentStart()),0); - pos_in_flags <= Math.min((int)local_right-read.getAlignmentStart(),flags.length - 1); - pos_in_flags++) { - - if ( flags[pos_in_flags] == -1 ) continue; // gap (deletion), skip it; we count only bases aligned to the ref - total_bases_in_nqs_window++; - if ( read_has_consensus ) consensus_indel_read_bases_in_nqs_window++; - if ( read_has_a_variant ) indel_read_bases_in_nqs_window++; - - if ( quals[pos_in_flags] != -1 ) { - - total_base_qual_in_nqs_window += quals[pos_in_flags]; - if ( read_has_a_variant ) indel_read_base_qual_in_nqs_window += quals[pos_in_flags]; - if ( read_has_consensus ) consensus_indel_read_base_qual_in_nqs_window += quals[pos_in_flags]; - } - - if ( flags[pos_in_flags] == 1 ) { // it's a mismatch - total_mismatches_in_nqs_window++; - total_mismatching_base_qual_in_nqs_window += quals[pos_in_flags]; - - if ( read_has_consensus ) { - consensus_indel_read_mismatches_in_nqs_window++; - consensus_indel_read_mismatching_base_qual_in_nqs_window += quals[pos_in_flags]; - } - - if ( read_has_a_variant ) { - indel_read_mismatches_in_nqs_window++; - indel_read_mismatching_base_qual_in_nqs_window += quals[pos_in_flags]; - } - } - } -// if ( pos == 3534096 ) { -// System.out.println(read.getReadName()); -// System.out.println(" cons nqs bases="+consensus_indel_read_bases_in_nqs_window); -// System.out.println(" qual sum="+consensus_indel_read_base_qual_in_nqs_window); -// } - - } - - // compute median/mad for offsets from the read starts/ends - if ( consensus_indel != null ) { - from_start_median = median(consensus_indel.getOffsetsFromStart()) ; - from_start_mad = mad(consensus_indel.getOffsetsFromStart(),from_start_median); - from_end_median = median(consensus_indel.getOffsetsFromEnd()) ; - from_end_mad = mad(consensus_indel.getOffsetsFromEnd(),from_end_median); - } - } - - /** As a side effect will sort l! - * - * @param l - * @return - */ - private int median(List l) { - Collections.sort(l); - int k = l.size()/2; - return ( l.size() % 2 == 0 ? - (l.get(k-1).intValue()+l.get(k).intValue())/2 : - l.get(k).intValue()); - } - - private int median(int[] l) { - Arrays.sort(l); - int k = l.length/2; - return ( l.length % 2 == 0 ? - (l[k-1]+l[k])/2 : - l[k]); - } - - private int mad(List l, int med) { - int [] diff = new int[l.size()]; - for ( int i = 0; i < l.size(); i++ ) { - diff[i] = Math.abs(l.get(i).intValue() - med); - } - return median(diff); - } - - public long getPosition() { return pos; } - - public boolean hasObservation() { return consensus_indel != null; } - - public int getCoverage() { return total_coverage; } - - public double getTotalMismatches() { return all_read_total_mm; } - public double getConsensusMismatches() { return consensus_indel_read_total_mm; } - public double getAllVariantMismatches() { return all_indel_read_total_mm; } - - /** Returns average number of mismatches per consensus indel-containing read */ - public double getAvConsensusMismatches() { - return ( consensus_indel_count != 0 ? consensus_indel_read_total_mm/consensus_indel_count : 0.0 ); - } - - /** Returns average number of mismatches per read across all reads matching the ref (not containing any indel variants) */ - public double getAvRefMismatches() { - int coverage_ref = total_coverage-all_indel_count; - return ( coverage_ref != 0 ? (all_read_total_mm - all_indel_read_total_mm )/coverage_ref : 0.0 ); - } - - public PrimitivePair.Int getConsensusStrandCounts() { - return consensus_indel_read_orientation_cnt; - } - - public PrimitivePair.Int getRefStrandCounts() { - return new PrimitivePair.Int(all_read_orientation_cnt.first-all_indel_read_orientation_cnt.first, - all_read_orientation_cnt.second - all_indel_read_orientation_cnt.second); - } - - /** Returns a sum of mapping qualities of all reads spanning the event. */ - public double getTotalMapq() { return all_read_total_mapq; } - - /** Returns a sum of mapping qualities of all reads, in which the consensus variant is observed. */ - public double getConsensusMapq() { return consensus_indel_read_total_mapq; } - - /** Returns a sum of mapping qualities of all reads, in which any variant is observed at the current event site. */ - public double getAllVariantMapq() { return all_indel_read_total_mapq; } - - /** Returns average mapping quality per consensus indel-containing read. */ - public double getAvConsensusMapq() { - return ( consensus_indel_count != 0 ? consensus_indel_read_total_mapq/consensus_indel_count : 0.0 ); - } - - /** Returns average number of mismatches per read across all reads matching the ref (not containing any indel variants). */ - public double getAvRefMapq() { - int coverage_ref = total_coverage-all_indel_count; - return ( coverage_ref != 0 ? (all_read_total_mapq - all_indel_read_total_mapq )/coverage_ref : 0.0 ); - } - - /** Returns fraction of bases in NQS window around the indel that are mismatches, across all reads, - * in which consensus indel is observed. NOTE: NQS window for indel containing reads is defined around - * the indel itself (e.g. for a 10-base deletion spanning [X,X+9], the 5-NQS window is {[X-5,X-1],[X+10,X+15]} - * */ - public double getNQSConsensusMMRate() { - if ( consensus_indel_read_bases_in_nqs_window == 0 ) return 0; - return ((double)consensus_indel_read_mismatches_in_nqs_window)/consensus_indel_read_bases_in_nqs_window; - } - - /** Returns fraction of bases in NQS window around the indel start position that are mismatches, across all reads - * that align to the ref (i.e. contain no indel observation at the current position). NOTE: NQS window for ref - * reads is defined around the event start position, NOT around the actual consensus indel. - * */ - public double getNQSRefMMRate() { - int num_ref_bases = total_bases_in_nqs_window - indel_read_bases_in_nqs_window; - if ( num_ref_bases == 0 ) return 0; - return ((double)(total_mismatches_in_nqs_window - indel_read_mismatches_in_nqs_window))/num_ref_bases; - } - - /** Returns average base quality in NQS window around the indel, across all reads, - * in which consensus indel is observed. NOTE: NQS window for indel containing reads is defined around - * the indel itself (e.g. for a 10-base deletion spanning [X,X+9], the 5-NQS window is {[X-5,X-1],[X+10,X+15]} - * */ - public double getNQSConsensusAvQual() { - if ( consensus_indel_read_bases_in_nqs_window == 0 ) return 0; - return ((double)consensus_indel_read_base_qual_in_nqs_window)/consensus_indel_read_bases_in_nqs_window; - } - - /** Returns fraction of bases in NQS window around the indel start position that are mismatches, across all reads - * that align to the ref (i.e. contain no indel observation at the current position). NOTE: NQS window for ref - * reads is defined around the event start position, NOT around the actual consensus indel. - * */ - public double getNQSRefAvQual() { - int num_ref_bases = total_bases_in_nqs_window - indel_read_bases_in_nqs_window; - if ( num_ref_bases == 0 ) return 0; - return ((double)(total_base_qual_in_nqs_window - indel_read_base_qual_in_nqs_window))/num_ref_bases; - } - - public int getTotalNQSMismatches() { return total_mismatches_in_nqs_window; } - - public int getAllVariantCount() { return all_indel_count; } - public int getConsensusVariantCount() { return consensus_indel_count; } - -// public boolean failsNQSMismatch() { -// //TODO wrong fraction: mismatches are counted only in indel-containing reads, but total_coverage is used! -// return ( indel_read_mismatches_in_nqs_window > NQS_MISMATCH_CUTOFF ) || -// ( indel_read_mismatches_in_nqs_window > total_coverage * AV_MISMATCHES_PER_READ ); -// } - - public IndelVariant getVariant() { return consensus_indel; } - - public boolean isCall() { - boolean ret = ( consensus_indel_count >= minIndelCount && - (double)consensus_indel_count > minFraction * total_coverage && - (double) consensus_indel_count > minConsensusFraction*all_indel_count && total_coverage >= minCoverage); - if ( DEBUG && ! ret ) System.out.println("DEBUG>> NOT a call: count="+consensus_indel_count+ - " total_count="+all_indel_count+" cov="+total_coverage+ - " minConsensusF="+((double)consensus_indel_count)/all_indel_count+ - " minF="+((double)consensus_indel_count)/total_coverage); - return ret; - - } - - /** Utility method: finds the indel variant with the largest count (ie consensus) among all the observed - * variants, and sets the counts of consensus observations and all observations of any indels (including non-consensus) - * @param variants - * @return - */ - private void findConsensus(List variants) { - for ( IndelVariant var : variants ) { - if ( DEBUG ) System.out.println("DEBUG>> Variant "+var.getBases()+" (cnt="+var.getCount()+")"); - int cnt = var.getCount(); - all_indel_count +=cnt; - if ( cnt > consensus_indel_count ) { - consensus_indel = var; - consensus_indel_count = cnt; - } - } - if ( DEBUG && consensus_indel != null ) System.out.println("DEBUG>> Returning: "+consensus_indel.getBases()+ - " (cnt="+consensus_indel.getCount()+") with total count of "+all_indel_count); - } - - - - public void printBedLine(Writer bed) { - int event_length; - if ( consensus_indel == null ) event_length = 0; - else { - event_length = consensus_indel.lengthOnRef(); - if ( event_length < 0 ) event_length = 0; - } - - StringBuffer message = new StringBuffer(); - message.append(refName+"\t"+(pos-1)+"\t"); - message.append((pos-1+event_length)+"\t"); - if ( consensus_indel != null ) { - message.append((event_length>0? "-":"+")+consensus_indel.getBases()); - } else { - message.append('.'); - } - message.append(":"+all_indel_count+"/"+total_coverage); - try { - bed.write(message.toString()+"\n"); - } catch (IOException e) { - throw new UserException.CouldNotCreateOutputFile(bedOutput, "Error encountered while writing into output BED file", e); - } - } - - public String makeEventString() { - int event_length; - if ( consensus_indel == null ) event_length = 0; - else { - event_length = consensus_indel.lengthOnRef(); - if ( event_length < 0 ) event_length = 0; - } - StringBuffer message = new StringBuffer(); - message.append(refName); - message.append('\t'); - message.append(pos-1); - message.append('\t'); - message.append(pos-1+event_length); - message.append('\t'); - if ( consensus_indel != null ) { - message.append((event_length>0?'-':'+')); - message.append(consensus_indel.getBases()); - } else { - message.append('.'); - } - return message.toString(); - } - - public String makeStatsString(String prefix) { - StringBuilder message = new StringBuilder(); - message.append(prefix+"OBS_COUNTS[C/A/T]:"+getConsensusVariantCount()+"/"+getAllVariantCount()+"/"+getCoverage()); - message.append('\t'); - message.append(prefix+"AV_MM[C/R]:"+String.format("%.2f/%.2f",getAvConsensusMismatches(), - getAvRefMismatches())); - message.append('\t'); - message.append(prefix+"AV_MAPQ[C/R]:"+String.format("%.2f/%.2f",getAvConsensusMapq(), - getAvRefMapq())); - message.append('\t'); - message.append(prefix+"NQS_MM_RATE[C/R]:"+String.format("%.2f/%.2f",getNQSConsensusMMRate(),getNQSRefMMRate())); - message.append('\t'); - message.append(prefix+"NQS_AV_QUAL[C/R]:"+String.format("%.2f/%.2f",getNQSConsensusAvQual(),getNQSRefAvQual())); - - PrimitivePair.Int strand_cons = getConsensusStrandCounts(); - PrimitivePair.Int strand_ref = getRefStrandCounts(); - message.append('\t'); - message.append(prefix+"STRAND_COUNTS[C/C/R/R]:"+strand_cons.first+"/"+strand_cons.second+"/"+strand_ref.first+"/"+strand_ref.second); - - message.append('\t'); - message.append(prefix+"OFFSET_RSTART:"+from_start_median+"/"+from_start_mad); - message.append('\t'); - message.append(prefix+"OFFSET_REND:"+from_end_median+"/"+from_end_mad); - - return message.toString(); - - } - - /** - * Places alignment statistics into attribute map and returns the map. If attr parameter is null, - * a new map is allocated, filled and returned. If attr is not null, new attributes are added to that - * preexisting map, and the same instance of the (updated) map is returned. - * - * @param attr - * @return - */ - public Map makeStatsAttributes(Map attr) { - if ( attr == null ) attr = new HashMap(); - - VCFIndelAttributes.recordDepth(getConsensusVariantCount(),getAllVariantCount(),getCoverage(),attr); - - VCFIndelAttributes.recordAvMM(getAvConsensusMismatches(),getAvRefMismatches(),attr); - - VCFIndelAttributes.recordAvMapQ(getAvConsensusMapq(),getAvRefMapq(),attr); - - VCFIndelAttributes.recordNQSMMRate(getNQSConsensusMMRate(),getNQSRefMMRate(),attr); - - VCFIndelAttributes.recordNQSAvQ(getNQSConsensusAvQual(),getNQSRefAvQual(),attr); - - VCFIndelAttributes.recordOffsetFromStart(from_start_median,from_start_mad,attr); - - VCFIndelAttributes.recordOffsetFromEnd(from_end_median,from_end_mad,attr); - - PrimitivePair.Int strand_cons = getConsensusStrandCounts(); - PrimitivePair.Int strand_ref = getRefStrandCounts(); - - VCFIndelAttributes.recordStrandCounts(strand_cons.first,strand_cons.second,strand_ref.first,strand_ref.second,attr); - return attr; - } - } - - interface IndelListener { - public void addObservation(int pos, IndelVariant.Type t, String bases, int fromStart, int fromEnd, ExpandedSAMRecord r); - } - - class WindowContext implements IndelListener { - private Set reads; - private int start=0; // where the window starts on the ref, 1-based - private CircularArray< List< IndelVariant > > indels; - - private List emptyIndelList = new ArrayList(); - - - public WindowContext(int start, int length) { - this.start = start; - indels = new CircularArray< List >(length); -// reads = new LinkedList(); - reads = new HashSet(); - } - - /** Returns 1-based reference start position of the interval this object keeps context for. - * - * @return - */ - public int getStart() { return start; } - - /** Returns 1-based reference stop position (inclusive) of the interval this object keeps context for. - * - * @return - */ - public int getStop() { return start + indels.length() - 1; } - - /** Resets reference start position to 0 and clears the context. - * - */ - public void clear() { - start = 0; - reads.clear(); - indels.clear(); - } - - /** - * Returns true if any indel observations are present in the specified interval - * [begin,end] (1-based, inclusive). Interval can be partially of fully outside of the - * current context window: positions outside of the window will be ignored. - * @param begin - * @param end - */ - public boolean hasIndelsInInterval(long begin, long end) { - for ( long k = Math.max(start,begin); k < Math.min(getStop(),end); k++ ) { - if ( indelsAt(k) != emptyIndelList ) return true; - } - return false; - } - - public Set getReads() { return reads; } - - /** Returns the number of reads spanning over the specified reference position - * (regardless of whether they have a base or indel at that specific location). - * The second argument controls whether to count with indels in mind (this is relevant for insertions only, - * deletions do not require any special treatment since they occupy non-zero length on the ref and since - * alignment can not start or end with a deletion). For insertions, note that, internally, we assign insertions - * to the reference position right after the actual event, and we count all events assigned to a given position. - * This count (reads with indels) should be contrasted to reads without indels, or more rigorously, reads - * that support the ref rather than the indel. Few special cases may occur here: - * 1) an alignment that ends (as per getAlignmentEnd()) right before the current position but has I as its - * last element: we have to count that read into the "coverage" at the current position for the purposes of indel - * assessment, as the indel in that read will be counted at the current position, so the total coverage - * should be consistent with that. - */ - /* NOT IMPLEMENTED: 2) alsignments that start exactly at the current position do not count for the purpose of insertion - * assessment since they do not contribute any evidence to either Ref or Alt=insertion hypothesis, unless - * the alignment starts with I (so that we do have evidence for an indel assigned to the current position and - * read should be counted). For deletions, reads starting at the current position should always be counted (as they - * show no deletion=ref). - * @param refPos position on the reference; must be within the bounds of the window - */ - public int coverageAt(final long refPos, boolean countForIndels) { - int cov = 0; - for ( ExpandedSAMRecord read : reads ) { - if ( read.getSAMRecord().getAlignmentStart() > refPos || read.getSAMRecord().getAlignmentEnd() < refPos ) { - if ( countForIndels && read.getSAMRecord().getAlignmentEnd() == refPos - 1) { - Cigar c = read.getSAMRecord().getCigar(); - if ( c.getCigarElement(c.numCigarElements()-1).getOperator() == CigarOperator.I ) cov++; - } - continue; - } - cov++; - } - return cov; - } - - - /** Shifts current window to the right along the reference contig by the specified number of bases. - * The context will be updated accordingly (indels and reads that go out of scope will be dropped). - * @param offset - */ - public void shift(int offset) { - start += offset; - - indels.shiftData(offset); - if ( indels.get(0) != null && indels.get(0).size() != 0 ) { - IndelVariant indel = indels.get(0).get(0); - - System.out.println("WARNING: Indel(s) at first position in the window ("+refName+":"+start+"): currently not supported: "+ - (indel.getType()==IndelVariant.Type.I?"+":"-")+indel.getBases()+"; read: "+indel.getReadSet().iterator().next().getSAMRecord().getReadName()+"; site ignored"); - indels.get(0).clear(); -// throw new StingException("Indel found at the first position ("+start+") after a shift was performed: currently not supported: "+ -// (indel.getType()==IndelVariant.Type.I?"+":"-")+indel.getBases()+"; reads: "+indel.getReadSet().iterator().next().getSAMRecord().getReadName()); - } - - Iterator read_iter = reads.iterator(); - - while ( read_iter.hasNext() ) { - ExpandedSAMRecord r = read_iter.next(); - if ( r.getSAMRecord().getAlignmentEnd() < start ) { // discard reads and associated data that went out of scope - read_iter.remove(); - } - } - } - - public void add(SAMRecord read, byte [] ref) { - - if ( read.getAlignmentStart() < start ) return; // silently ignore reads starting before the window start - - ExpandedSAMRecord er = new ExpandedSAMRecord(read,ref,read.getAlignmentStart()-start,this); - //TODO duplicate records may actually indicate a problem with input bam file; throw an exception when the bug in CleanedReadInjector is fixed - if ( reads.contains(er)) return; // ignore duplicate records - reads.add(er); - } - - public void addObservation(int pos, IndelVariant.Type type, String bases, int fromStart, int fromEnd, ExpandedSAMRecord rec) { - List indelsAtSite; - try { - indelsAtSite = indels.get(pos); - } catch (IndexOutOfBoundsException e) { - SAMRecord r = rec.getSAMRecord(); - System.out.println("Failed to add indel observation, probably out of coverage window bounds (trailing indel?):\nRead "+ - r.getReadName()+": "+ - "read length="+r.getReadLength()+"; cigar="+r.getCigarString()+"; start="+ - r.getAlignmentStart()+"; end="+r.getAlignmentEnd()+"; window start="+getStart()+ - "; window end="+getStop()); - throw e; - } - - if ( indelsAtSite == null ) { - indelsAtSite = new ArrayList(); - indels.set(pos, indelsAtSite); - } - - IndelVariant indel = null; - for ( IndelVariant v : indelsAtSite ) { - if ( ! v.equals(type, bases) ) continue; - - indel = v; - indel.addObservation(rec); - break; - } - - if ( indel == null ) { // not found: - indel = new IndelVariant(rec, type, bases); - indelsAtSite.add(indel); - } - indel.addReadPositions(fromStart,fromEnd); - } - - public List indelsAt( final long refPos ) { - List l = indels.get((int)( refPos - start )); - if ( l == null ) return emptyIndelList; - else return l; - } - - - } - - - class ExpandedSAMRecord { - private SAMRecord read; - private byte[] mismatch_flags; - private byte[] expanded_quals; - private int mms; - - public ExpandedSAMRecord(SAMRecord r, byte [] ref, long offset, IndelListener l) { - - read = r; - final long rStart = read.getAlignmentStart(); - final long rStop = read.getAlignmentEnd(); - final byte[] readBases = read.getReadString().toUpperCase().getBytes(); - - ref = new String(ref).toUpperCase().getBytes(); - - mismatch_flags = new byte[(int)(rStop-rStart+1)]; - expanded_quals = new byte[(int)(rStop-rStart+1)]; - - // now let's extract indels: - - Cigar c = read.getCigar(); - final int nCigarElems = c.numCigarElements(); - - - int readLength = 0; // length of the aligned part of the read NOT counting clipped bases - for ( CigarElement cel : c.getCigarElements() ) { - - switch(cel.getOperator()) { - case H: - case S: - case D: - case N: - case P: - break; // do not count gaps or clipped bases - case I: - case M: - readLength += cel.getLength(); - break; // advance along the gapless block in the alignment - default : - throw new IllegalArgumentException("Unexpected operator in cigar string: "+cel.getOperator()); - } - } - - int fromStart = 0; - int posOnRead = 0; - int posOnRef = 0; // the chunk of reference ref[] that we have access to is aligned with the read: - // its start on the actual full reference contig is r.getAlignmentStart() - for ( int i = 0 ; i < nCigarElems ; i++ ) { - - final CigarElement ce = c.getCigarElement(i); - IndelVariant.Type type = null; - String indel_bases = null; - int eventPosition = posOnRef; - - switch(ce.getOperator()) { - case H: break; // hard clipped reads do not have clipped indel_bases in their sequence, so we just ignore the H element... - case I: - type = IndelVariant.Type.I; - indel_bases = read.getReadString().substring(posOnRead,posOnRead+ce.getLength()); - // will increment position on the read below, there's no 'break' statement yet... - case S: - // here we also skip soft-clipped indel_bases on the read; according to SAM format specification, - // alignment start position on the reference points to where the actually aligned - // (not clipped) indel_bases go, so we do not need to increment reference position here - posOnRead += ce.getLength(); - break; - case D: - type = IndelVariant.Type.D; - indel_bases = new String( ref, posOnRef, ce.getLength() ); - for( int k = 0 ; k < ce.getLength(); k++, posOnRef++ ) mismatch_flags[posOnRef] = expanded_quals[posOnRef] = -1; - - break; - case M: - for ( int k = 0; k < ce.getLength(); k++, posOnRef++, posOnRead++ ) { - if ( readBases[posOnRead] != ref[posOnRef] ) { // mismatch! - mms++; - mismatch_flags[posOnRef] = 1; - } - expanded_quals[posOnRef] = read.getBaseQualities()[posOnRead]; - } - fromStart += ce.getLength(); - break; // advance along the gapless block in the alignment - default : - throw new IllegalArgumentException("Unexpected operator in cigar string: "+ce.getOperator()); - } - - if ( type == null ) continue; // element was not an indel, go grab next element... - - // we got an indel if we are here... - if ( i == 0 ) logger.debug("Indel at the start of the read "+read.getReadName()); - if ( i == nCigarElems - 1) logger.debug("Indel at the end of the read "+read.getReadName()); - - // note that here we will be assigning indels to the first deleted base or to the first - // base after insertion, not to the last base before the event! - int fromEnd = readLength - fromStart; - if ( type == IndelVariant.Type.I ) fromEnd -= ce.getLength(); - - l.addObservation((int)(offset+eventPosition), type, indel_bases, fromStart, fromEnd, this); - - if ( type == IndelVariant.Type.I ) fromStart += ce.getLength(); - - } - } - - public SAMRecord getSAMRecord() { return read; } - - public byte [] getExpandedMMFlags() { return mismatch_flags; } - - public byte [] getExpandedQuals() { return expanded_quals; } - - public int getMMCount() { return mms; } - - public boolean equals(Object o) { - if ( this == o ) return true; - if ( read == null ) return false; - if ( o instanceof SAMRecord ) return read.equals(o); - if ( o instanceof ExpandedSAMRecord ) return read.equals(((ExpandedSAMRecord)o).read); - return false; - } - - - } - -} - - -class VCFIndelAttributes { - public static String ALLELIC_DEPTH_KEY = "AD"; - public static String DEPTH_TOTAL_KEY = VCFConstants.DEPTH_KEY; - - public static String MAPQ_KEY = "MQS"; - - public static String MM_KEY = "MM"; - - public static String NQS_MMRATE_KEY = "NQSMM"; - - public static String NQS_AVQ_KEY = "NQSBQ"; - - public static String STRAND_COUNT_KEY = "SC"; - public static String RSTART_OFFSET_KEY = "RStart"; - public static String REND_OFFSET_KEY = "REnd"; - - public static Set getAttributeHeaderLines() { - Set lines = new HashSet(); - - lines.add(new VCFFormatHeaderLine(ALLELIC_DEPTH_KEY, 2, VCFHeaderLineType.Integer, "# of reads supporting consensus indel/reference at the site")); - lines.add(new VCFFormatHeaderLine(DEPTH_TOTAL_KEY, 1, VCFHeaderLineType.Integer, "Total coverage at the site")); - - lines.add(new VCFFormatHeaderLine(MAPQ_KEY, 2, VCFHeaderLineType.Float, "Average mapping qualities of consensus indel-supporting reads/reference-supporting reads")); - - lines.add(new VCFFormatHeaderLine(MM_KEY, 2, VCFHeaderLineType.Float, "Average # of mismatches per consensus indel-supporting read/per reference-supporting read")); - - lines.add(new VCFFormatHeaderLine(NQS_MMRATE_KEY, 2, VCFHeaderLineType.Float, "Within NQS window: fraction of mismatching bases in consensus indel-supporting reads/in reference-supporting reads")); - - lines.add(new VCFFormatHeaderLine(NQS_AVQ_KEY, 2, VCFHeaderLineType.Float, "Within NQS window: average quality of bases from consensus indel-supporting reads/from reference-supporting reads")); - - lines.add(new VCFFormatHeaderLine(STRAND_COUNT_KEY, 4, VCFHeaderLineType.Integer, "Strandness: counts of forward-/reverse-aligned indel-supporting reads / forward-/reverse-aligned reference supporting reads")); - - lines.add(new VCFFormatHeaderLine(RSTART_OFFSET_KEY, 2, VCFHeaderLineType.Integer, "Median/mad of indel offsets from the starts of the reads")); - lines.add(new VCFFormatHeaderLine(REND_OFFSET_KEY, 2, VCFHeaderLineType.Integer, "Median/mad of indel offsets from the ends of the reads")); - - return lines; - } - - public static Map recordStrandCounts(int cnt_cons_fwd, int cnt_cons_rev, int cnt_ref_fwd, int cnt_ref_rev, Map attrs) { - attrs.put(STRAND_COUNT_KEY, new Integer[] {cnt_cons_fwd, cnt_cons_rev, cnt_ref_fwd, cnt_ref_rev} ); - return attrs; - } - - public static Map recordDepth(int cnt_cons, int cnt_indel, int cnt_total, Map attrs) { - attrs.put(ALLELIC_DEPTH_KEY, new Integer[] {cnt_cons, cnt_indel} ); - attrs.put(DEPTH_TOTAL_KEY, cnt_total); - return attrs; - } - - public static Map recordAvMapQ(double cons, double ref, Map attrs) { - attrs.put(MAPQ_KEY, new Float[] {(float)cons, (float)ref} ); - return attrs; - } - - public static Map recordAvMM(double cons, double ref, Map attrs) { - attrs.put(MM_KEY, new Float[] {(float)cons, (float)ref} ); - return attrs; - } - - public static Map recordNQSMMRate(double cons, double ref, Map attrs) { - attrs.put(NQS_MMRATE_KEY, new Float[] {(float)cons, (float)ref} ); - return attrs; - } - - public static Map recordNQSAvQ(double cons, double ref, Map attrs) { - attrs.put(NQS_AVQ_KEY, new Float[] {(float)cons, (float)ref} ); - return attrs; - } - - public static Map recordOffsetFromStart(int median, int mad, Map attrs) { - attrs.put(RSTART_OFFSET_KEY, new Integer[] {median, mad} ); - return attrs; - } - - public static Map recordOffsetFromEnd(int median, int mad, Map attrs) { - attrs.put(REND_OFFSET_KEY, new Integer[] {median, mad} ); - return attrs; - } -} +/* + * Copyright (c) 2010 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.walkers.indels; + +import net.sf.samtools.*; +import org.broadinstitute.sting.commandline.Argument; +import org.broadinstitute.sting.commandline.Hidden; +import org.broadinstitute.sting.commandline.Output; +import org.broadinstitute.sting.commandline.Tags; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.datasources.reads.SAMReaderID; +import org.broadinstitute.sting.gatk.datasources.reference.ReferenceDataSource; +import org.broadinstitute.sting.gatk.filters.MappingQualityZeroReadFilter; +import org.broadinstitute.sting.gatk.filters.Platform454Filter; +import org.broadinstitute.sting.gatk.filters.PlatformUnitFilter; +import org.broadinstitute.sting.gatk.filters.PlatformUnitFilterHelper; +import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker; +import org.broadinstitute.sting.gatk.refdata.SeekableRODIterator; +import org.broadinstitute.sting.gatk.refdata.features.refseq.Transcript; +import org.broadinstitute.sting.gatk.refdata.features.refseq.RefSeqCodec; +import org.broadinstitute.sting.gatk.refdata.features.refseq.RefSeqFeature; +import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrack; +import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrackBuilder; +import org.broadinstitute.sting.gatk.refdata.utils.LocationAwareSeekableRODIterator; +import org.broadinstitute.sting.gatk.refdata.utils.RODRecordList; +import org.broadinstitute.sting.gatk.walkers.ReadFilters; +import org.broadinstitute.sting.gatk.walkers.ReadWalker; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.GenomeLocSortedSet; +import org.broadinstitute.sting.utils.SampleUtils; +import org.broadinstitute.sting.utils.codecs.vcf.*; +import org.broadinstitute.sting.utils.collections.CircularArray; +import org.broadinstitute.sting.utils.collections.PrimitivePair; +import org.broadinstitute.sting.utils.exceptions.StingException; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.interval.IntervalFileMergingIterator; +import org.broadinstitute.sting.utils.interval.IntervalMergingRule; +import org.broadinstitute.sting.utils.interval.IntervalUtils; +import org.broadinstitute.sting.utils.interval.OverlappingIntervalIterator; +import org.broadinstitute.sting.utils.sam.AlignmentUtils; +import org.broadinstitute.sting.utils.variantcontext.Allele; +import org.broadinstitute.sting.utils.variantcontext.Genotype; +import org.broadinstitute.sting.utils.variantcontext.VariantContext; + +import java.io.*; +import java.util.*; + +/** + * This is a simple, counts-and-cutoffs based tool for calling indels from aligned (preferrably MSA cleaned) sequencing + * data. Two output formats supported are: BED format (minimal output, required), and extended output that includes read + * and mismtach statistics around the calls (tuned on with --verbose). The calls can be performed from a single/pooled sample, + * or from a matched pair of samples (with --somatic option). In the latter case, two input bam files must be specified, + * the order is important: indels are called from the second sample ("Tumor") and additionally annotated as germline + * if even a weak evidence for the same indel, not necessarily a confident call, exists in the first sample ("Normal"), or as somatic + * if first bam has coverage at the site but no indication for an indel. In the --somatic mode, BED output contains + * only somatic calls, while --verbose output contains all calls annotated with GERMLINE/SOMATIC keywords. + */ +@ReadFilters({Platform454Filter.class, MappingQualityZeroReadFilter.class, PlatformUnitFilter.class}) +public class SomaticIndelDetectorWalker extends ReadWalker { +// @Output +// PrintStream out; + @Output(doc="File to which variants should be written",required=true) + protected VCFWriter vcf_writer = null; + + @Argument(fullName="outputFile", shortName="O", doc="output file name (BED format). DEPRECATED> Use --bed", required=true) + @Deprecated + java.io.File output_file; + + @Argument(fullName = "metrics_file", shortName = "metrics", doc = "File to print callability metrics output", required = false) + public PrintStream metricsWriter = null; + +// @Argument(fullName="vcf_format", shortName="vcf", doc="generate output file in VCF format", required=false) +// boolean FORMAT_VCF = false; + + @Hidden + @Argument(fullName = "genotype_intervals", shortName = "genotype", + doc = "Calls will be made at each position within the specified interval(s), whether there is an indel or it's the ref", required = false) + public String genotypeIntervalsFile = null; + + @Hidden + @Argument(fullName="genotypeIntervalsAreNotSorted", shortName="giNotSorted", required=false, + doc="This tool assumes that the genotyping interval list (--genotype_intervals) is sorted; "+ + "if the list turns out to be unsorted, it will throw an exception. "+ + "Use this argument when your interval list is not sorted to instruct the IndelGenotyper "+ + "to sort and keep it in memory (increases memory usage!).") + protected boolean GENOTYPE_NOT_SORTED = false; + + @Hidden + @Argument(fullName="unpaired", shortName="unpaired", + doc="Perform unpaired calls (no somatic status detection)", required=false) + boolean call_unpaired = false; + boolean call_somatic ; + + @Argument(fullName="verboseOutput", shortName="verbose", + doc="Verbose output file in text format", required=false) + java.io.File verboseOutput = null; + + @Argument(fullName="bedOutput", shortName="bed", + doc="Lightweight bed output file (only positions and events, no stats/annotations)", required=false) + java.io.File bedOutput = null; + + @Argument(fullName="minCoverage", shortName="minCoverage", + doc="indel calls will be made only at sites with coverage of minCoverage or more reads; with --somatic this value is applied to tumor sample", required=false) + int minCoverage = 6; + + @Argument(fullName="minNormalCoverage", shortName="minNormalCoverage", + doc="used only with --somatic; normal sample must have at least minNormalCoverage or more reads at the site to call germline/somatic indel, otherwise the indel (in tumor) is ignored", required=false) + int minNormalCoverage = 4; + + @Argument(fullName="minFraction", shortName="minFraction", + doc="Minimum fraction of reads with CONSENSUS indel at a site, out of all reads covering the site, required for making a call"+ + " (fraction of non-consensus indels at the site is not considered here, see minConsensusFraction)", required=false) + double minFraction = 0.3; + + @Argument(fullName="minConsensusFraction", shortName="minConsensusFraction", + doc="Indel call is made only if fraction of CONSENSUS indel observations at a site wrt all indel observations at the site exceeds this threshold", required=false) + double minConsensusFraction = 0.7; + + @Argument(fullName="minIndelCount", shortName="minCnt", + doc="Minimum count of reads supporting consensus indel required for making the call. "+ + " This filter supercedes minFraction, i.e. indels with acceptable minFraction at low coverage "+ + "(minIndelCount not met) will not pass.", required=false) + int minIndelCount = 0; + + @Argument(fullName="refseq", shortName="refseq", + doc="Name of RefSeq transcript annotation file. If specified, indels will be annotated with GENOMIC/UTR/INTRON/CODING and with the gene name", required=false) + String RefseqFileName = null; + + @Argument(fullName="blacklistedLanes", shortName="BL", + doc="Name of lanes (platform units) that should be ignored. Reads coming from these lanes will never be seen "+ + "by this application, so they will not contribute indels to consider and will not be counted.", required=false) + PlatformUnitFilterHelper dummy; + @Argument(fullName="indel_debug", shortName="idebug", doc="Detailed printout for debugging, do not turn this on",required=false) Boolean DEBUG = false; + @Argument(fullName="window_size", shortName="ws", doc="Size (bp) of the sliding window used for accumulating the coverage. "+ + "May need to be increased to accomodate longer reads or longer deletions.",required=false) int WINDOW_SIZE = 200; + @Argument(fullName="maxNumberOfReads",shortName="mnr",doc="Maximum number of reads to cache in the window; if number of reads exceeds this number,"+ + " the window will be skipped and no calls will be made from it",required=false) int MAX_READ_NUMBER = 10000; + + private WindowContext tumor_context; + private WindowContext normal_context; + private int currentContigIndex = -1; + private int contigLength = -1; // we see to much messy data with reads hanging out of contig ends... + private int currentPosition = -1; // position of the last read we've seen on the current contig + private String refName = null; + private java.io.Writer output = null; + private GenomeLoc location = null; + private long normalCallsMade = 0L, tumorCallsMade = 0L; + + boolean outOfContigUserWarned = false; + + private LocationAwareSeekableRODIterator refseqIterator=null; + +// private Set normalReadGroups; // we are going to remember which read groups are normals and which are tumors in order to be able +// private Set tumorReadGroups ; // to properly assign the reads coming from a merged stream + private Set normalSamples; // we are going to remember which samples are normal and which are tumor: + private Set tumorSamples ; // these are used only to generate genotypes for vcf output + + private int NQS_WIDTH = 5; // 5 bases on each side of the indel for NQS-style statistics + + private Writer bedWriter = null; + private Writer verboseWriter = null; + + + private static String annGenomic = "GENOMIC"; + private static String annIntron = "INTRON"; + private static String annUTR = "UTR"; + private static String annCoding = "CODING"; + private static String annUnknown = "UNKNOWN"; + + enum CallType { + NOCOVERAGE, + BADCOVERAGE, + NOEVIDENCE, + GERMLINE, + SOMATIC + }; + + private SAMRecord lastRead; + private byte[] refBases; + private ReferenceDataSource refData; + private Iterator genotypeIntervalIterator = null; + + // the current interval in the list of intervals, for which we want to do full genotyping + private GenomeLoc currentGenotypeInterval = null; + private long lastGenotypedPosition = -1; // last position on the currentGenotypeInterval, for which a call was already printed; + // can be 1 base before lastGenotyped start + + + // "/humgen/gsa-scr1/GATK_Data/refGene.sorted.txt" + + private Set getVCFHeaderInfo() { + Set headerInfo = new HashSet(); + + // first, the basic info + headerInfo.add(new VCFHeaderLine("source", "IndelGenotyperV2")); + headerInfo.add(new VCFHeaderLine("reference", getToolkit().getArguments().referenceFile.getName())); + + // FORMAT and INFO fields +// headerInfo.addAll(VCFUtils.getSupportedHeaderStrings()); + + headerInfo.addAll(VCFIndelAttributes.getAttributeHeaderLines()); + if ( call_somatic ) { + headerInfo.add(new VCFInfoHeaderLine(VCFConstants.SOMATIC_KEY, 0, VCFHeaderLineType.Flag, "Somatic event")); + } else { + } + + // all of the arguments from the argument collection + Set args = new HashSet(); + args.add(this); + args.addAll(getToolkit().getFilters()); + Map commandLineArgs = getToolkit().getApproximateCommandLineArguments(args); + for ( Map.Entry commandLineArg : commandLineArgs.entrySet() ) + headerInfo.add(new VCFHeaderLine(String.format("IGv2_%s", commandLineArg.getKey()), commandLineArg.getValue())); + // also, the list of input bams + for ( String fileName : getToolkit().getArguments().samFiles ) + headerInfo.add(new VCFHeaderLine("IGv2_bam_file_used", fileName)); + + return headerInfo; + } + + + @Override + public void initialize() { + + call_somatic = (call_unpaired ? false : true); + normal_context = new WindowContext(0,WINDOW_SIZE); + normalSamples = new HashSet(); + + if ( bedOutput != null && output_file != null ) { + throw new UserException.DeprecatedArgument("-O", "-O option is deprecated and -bed option replaces it; you can not use both at the same time"); + } + + if ( RefseqFileName != null ) { + logger.info("Using RefSeq annotations from "+RefseqFileName); + + RMDTrackBuilder builder = new RMDTrackBuilder(getToolkit().getReferenceDataSource().getReference().getSequenceDictionary(), + getToolkit().getGenomeLocParser(), + getToolkit().getArguments().unsafe); + RMDTrack refseq = builder.createInstanceOfTrack(RefSeqCodec.class,new File(RefseqFileName)); + + refseqIterator = new SeekableRODIterator(refseq.getHeader(), + refseq.getSequenceDictionary(), + getToolkit().getReferenceDataSource().getReference().getSequenceDictionary(), + getToolkit().getGenomeLocParser(), + refseq.getIterator()); + } + + if ( refseqIterator == null ) logger.info("No gene annotations available"); + + int nSams = getToolkit().getArguments().samFiles.size(); + + if ( call_somatic ) { + if ( nSams < 2 ) throw new UserException.BadInput("In default (paired sample) mode at least two bam files (normal and tumor) must be specified"); + tumor_context = new WindowContext(0,WINDOW_SIZE); + tumorSamples = new HashSet(); + } + + int nNorm = 0; + int nTum = 0; + for ( SAMReaderID rid : getToolkit().getReadsDataSource().getReaderIDs() ) { + Tags tags = rid.getTags() ; + if ( tags.getPositionalTags().isEmpty() && call_somatic ) + throw new UserException.BadInput("In default (paired sample) mode all input bam files must be tagged as either 'normal' or 'tumor'. Untagged file: "+ + getToolkit().getSourceFileForReaderID(rid)); + boolean normal = false; + boolean tumor = false; + for ( String s : tags.getPositionalTags() ) { // we allow additional unrelated tags (and we do not use them), but we REQUIRE one of Tumor/Normal to be present if --somatic is on + if ( "NORMAL".equals(s.toUpperCase()) ) { + normal = true; + nNorm++; + } + if ( "TUMOR".equals(s.toUpperCase()) ) { + tumor = true; + nTum++ ; + } + } + if ( call_somatic && normal && tumor ) throw new UserException.BadInput("Input bam file "+ + getToolkit().getSourceFileForReaderID(rid)+" is tagged both as normal and as tumor. Which one is it??"); + if ( call_somatic && !normal && ! tumor ) + throw new UserException.BadInput("In somatic mode all input bams must be tagged as either normal or tumor. Encountered untagged file: "+ + getToolkit().getSourceFileForReaderID(rid)); + if ( ! call_somatic && (normal || tumor) ) + System.out.println("WARNING: input bam file "+getToolkit().getSourceFileForReaderID(rid) + +" is tagged as Normal and/or Tumor, but somatic mode is not on. Tags will ne IGNORED"); + if ( call_somatic && tumor ) { + for ( SAMReadGroupRecord rg : getToolkit().getSAMFileHeader(rid).getReadGroups() ) { + tumorSamples.add(rg.getSample()); + } + } else { + for ( SAMReadGroupRecord rg : getToolkit().getSAMFileHeader(rid).getReadGroups() ) { + normalSamples.add(rg.getSample()); + } + } + if ( genotypeIntervalsFile != null ) { + + if ( ! GENOTYPE_NOT_SORTED && IntervalUtils.isIntervalFile(genotypeIntervalsFile)) { + // prepare to read intervals one-by-one, as needed (assuming they are sorted). + genotypeIntervalIterator = new IntervalFileMergingIterator(getToolkit().getGenomeLocParser(), + new java.io.File(genotypeIntervalsFile), IntervalMergingRule.OVERLAPPING_ONLY ); + } else { + // read in the whole list of intervals for cleaning + GenomeLocSortedSet locs = IntervalUtils.sortAndMergeIntervals(getToolkit().getGenomeLocParser(), + IntervalUtils.parseIntervalArguments(getToolkit().getGenomeLocParser(),Arrays.asList(genotypeIntervalsFile),true), IntervalMergingRule.OVERLAPPING_ONLY); + genotypeIntervalIterator = locs.iterator(); + } + + // wrap intervals requested for genotyping inside overlapping iterator, so that we actually + // genotype only on the intersections of the requested intervals with the -L intervals + genotypeIntervalIterator = new OverlappingIntervalIterator(genotypeIntervalIterator, getToolkit().getIntervals().iterator() ); + + currentGenotypeInterval = genotypeIntervalIterator.hasNext() ? genotypeIntervalIterator.next() : null; + + if ( DEBUG) System.out.println("DEBUG>> first genotyping interval="+currentGenotypeInterval); + + if ( currentGenotypeInterval != null ) lastGenotypedPosition = currentGenotypeInterval.getStart()-1; + } + + } + + location = getToolkit().getGenomeLocParser().createGenomeLoc(getToolkit().getSAMFileHeader().getSequence(0).getSequenceName(),1); + + normalSamples = getToolkit().getSamplesByReaders().get(0); + + try { + // we already checked that bedOutput and output_file are not set simultaneously + if ( bedOutput != null ) bedWriter = new FileWriter(bedOutput); + if ( output_file != null ) bedWriter = new FileWriter(output_file); + } catch (java.io.IOException e) { + throw new UserException.CouldNotReadInputFile(bedOutput, "Failed to open BED file for writing.", e); + } + try { + if ( verboseOutput != null ) verboseWriter = new FileWriter(verboseOutput); + } catch (java.io.IOException e) { + throw new UserException.CouldNotReadInputFile(verboseOutput, "Failed to open BED file for writing.", e); + } + + vcf_writer.writeHeader(new VCFHeader(getVCFHeaderInfo(), SampleUtils.getSAMFileSamples(getToolkit().getSAMFileHeader()))) ; + refData = new ReferenceDataSource(getToolkit().getArguments().referenceFile); + } + + + @Override + public Integer map(ReferenceContext ref, SAMRecord read, ReadMetaDataTracker metaDataTracker) { + + // if ( read.getReadName().equals("428EFAAXX090610:2:36:1384:639#0") ) System.out.println("GOT READ"); + + if ( DEBUG ) { + // System.out.println("DEBUG>> read at "+ read.getAlignmentStart()+"-"+read.getAlignmentEnd()+ + // "("+read.getCigarString()+")"); + if ( read.getDuplicateReadFlag() ) System.out.println("DEBUG>> Duplicated read (IGNORED)"); + } + + if ( AlignmentUtils.isReadUnmapped(read) || + read.getDuplicateReadFlag() || + read.getNotPrimaryAlignmentFlag() || + read.getMappingQuality() == 0 ) { + return 0; // we do not need those reads! + } + + if ( read.getReferenceIndex() != currentContigIndex ) { + // we just jumped onto a new contig + if ( DEBUG ) System.out.println("DEBUG>>> Moved to contig "+read.getReferenceName()); + if ( read.getReferenceIndex() < currentContigIndex ) // paranoidal + throw new UserException.MissortedBAM(SAMFileHeader.SortOrder.coordinate, read, "Read "+read.getReadName()+": contig is out of order; input BAM file is unsorted"); + + // print remaining indels from the previous contig (if any); + if ( call_somatic ) emit_somatic(1000000000, true); + else emit(1000000000,true); + + currentContigIndex = read.getReferenceIndex(); + currentPosition = read.getAlignmentStart(); + refName = new String(read.getReferenceName()); + + location = getToolkit().getGenomeLocParser().createGenomeLoc(refName,location.getStart(),location.getStop()); + contigLength = getToolkit().getGenomeLocParser().getContigInfo(refName).getSequenceLength(); + outOfContigUserWarned = false; + + lastGenotypedPosition = -1; + + normal_context.clear(); // reset coverage window; this will also set reference position to 0 + if ( call_somatic) tumor_context.clear(); + + refBases = new String(refData.getReference().getSequence(read.getReferenceName()).getBases()).toUpperCase().getBytes(); + } + + // we have reset the window to the new contig if it was required and emitted everything we collected + // on a previous contig. At this point we are guaranteed that we are set up properly for working + // with the contig of the current read. + + // NOTE: all the sanity checks and error messages below use normal_context only. We make sure that normal_context and + // tumor_context are synchronized exactly (windows are always shifted together by emit_somatic), so it's safe + + if ( read.getAlignmentStart() < currentPosition ) // oops, read out of order? + throw new UserException.MissortedBAM(SAMFileHeader.SortOrder.coordinate, read, "Read "+read.getReadName() +" out of order on the contig\n"+ + "Read starts at "+refName+":"+read.getAlignmentStart()+"; last read seen started at "+refName+":"+currentPosition + +"\nLast read was: "+lastRead.getReadName()+" RG="+lastRead.getAttribute("RG")+" at "+lastRead.getAlignmentStart()+"-" + +lastRead.getAlignmentEnd()+" cigar="+lastRead.getCigarString()); + + currentPosition = read.getAlignmentStart(); + lastRead = read; + + if ( read.getAlignmentEnd() > contigLength ) { + if ( ! outOfContigUserWarned ) { + System.out.println("WARNING: Reads aligned past contig length on "+ location.getContig()+"; all such reads will be skipped"); + outOfContigUserWarned = true; + } + return 0; + } + + long alignmentEnd = read.getAlignmentEnd(); + Cigar c = read.getCigar(); + int lastNonClippedElement = 0; // reverse offset to the last unclipped element + CigarOperator op = null; + // moving backwards from the end of the cigar, skip trailing S or H cigar elements: + do { + lastNonClippedElement++; + op = c.getCigarElement( c.numCigarElements()-lastNonClippedElement ).getOperator(); + } while ( op == CigarOperator.H || op == CigarOperator.S ); + + // now op is the last non-S/H operator in the cigar. + + // a little trick here: we want to make sure that current read completely fits into the current + // window so that we can accumulate indel observations over the whole length of the read. + // The ::getAlignmentEnd() method returns the last position on the reference where bases from the + // read actually match (M cigar elements). After our cleaning procedure, we can have reads that end + // with I element, which is not gonna be counted into alignment length on the reference. On the other hand, + // in this program we assign insertions, internally, to the first base *after* the insertion position. + // Hence, we have to make sure that that extra base is already in the window or we will get IndexOutOfBounds. + + if ( op == CigarOperator.I) alignmentEnd++; + + if ( alignmentEnd > normal_context.getStop()) { + + // we don't emit anything until we reach a read that does not fit into the current window. + // At that point we try shifting the window to the start of that read (or reasonably close) and emit everything prior to + // that position. This is legitimate, since the reads are sorted and we are not gonna see any more coverage at positions + // below the current read's start. + // Clearly, we assume here that window is large enough to accomodate any single read, so simply shifting + // the window to around the read's start will ensure that the read fits... + + if ( DEBUG) System.out.println("DEBUG>> Window at "+normal_context.getStart()+"-"+normal_context.getStop()+", read at "+ + read.getAlignmentStart()+": trying to emit and shift" ); + if ( call_somatic ) emit_somatic( read.getAlignmentStart(), false ); + else emit( read.getAlignmentStart(), false ); + + // let's double check now that the read fits after the shift + if ( read.getAlignmentEnd() > normal_context.getStop()) { + // ooops, looks like the read does not fit into the window even after the latter was shifted!! + throw new UserException.BadArgumentValue("window_size", "Read "+read.getReadName()+": out of coverage window bounds. Probably window is too small, so increase the value of the window_size argument.\n"+ + "Read length="+read.getReadLength()+"; cigar="+read.getCigarString()+"; start="+ + read.getAlignmentStart()+"; end="+read.getAlignmentEnd()+ + "; window start (after trying to accomodate the read)="+normal_context.getStart()+"; window end="+normal_context.getStop()); + } + } + + if ( call_somatic ) { + + Tags tags = getToolkit().getReaderIDForRead(read).getTags(); + boolean assigned = false; + for ( String s : tags.getPositionalTags() ) { + if ( "NORMAL".equals(s.toUpperCase()) ) { + normal_context.add(read,ref.getBases()); + assigned = true; + break; + } + if ( "TUMOR".equals(s.toUpperCase()) ) { + tumor_context.add(read,ref.getBases()); + assigned = true; + break; + } + } + if ( ! assigned ) + throw new StingException("Read "+read.getReadName()+" from "+getToolkit().getSourceFileForReaderID(getToolkit().getReaderIDForRead(read))+ + "has no Normal/Tumor tag associated with it"); + +// String rg = (String)read.getAttribute("RG"); +// if ( rg == null ) +// throw new UserException.MalformedBam(read, "Read "+read.getReadName()+" has no read group in merged stream. RG is required for somatic calls."); + +// if ( normalReadGroups.contains(rg) ) { +// normal_context.add(read,ref.getBases()); +// } else if ( tumorReadGroups.contains(rg) ) { +// tumor_context.add(read,ref.getBases()); +// } else { +// throw new UserException.MalformedBam(read, "Unrecognized read group in merged stream: "+rg); +// } + + if ( tumor_context.getReads().size() > MAX_READ_NUMBER ) { + System.out.println("WARNING: a count of "+MAX_READ_NUMBER+" reads reached in a window "+ + refName+':'+tumor_context.getStart()+'-'+tumor_context.getStop()+" in tumor sample. The whole window will be dropped."); + tumor_context.shift(WINDOW_SIZE); + normal_context.shift(WINDOW_SIZE); + } + if ( normal_context.getReads().size() > MAX_READ_NUMBER ) { + System.out.println("WARNING: a count of "+MAX_READ_NUMBER+" reads reached in a window "+ + refName+':'+normal_context.getStart()+'-'+normal_context.getStop()+" in normal sample. The whole window will be dropped"); + tumor_context.shift(WINDOW_SIZE); + normal_context.shift(WINDOW_SIZE); + } + + + } else { + normal_context.add(read, ref.getBases()); + if ( normal_context.getReads().size() > MAX_READ_NUMBER ) { + System.out.println("WARNING: a count of "+MAX_READ_NUMBER+" reads reached in a window "+ + refName+':'+normal_context.getStart()+'-'+normal_context.getStop()+". The whole window will be dropped"); + normal_context.shift(WINDOW_SIZE); + } + } + + return 1; + } + + /** An auxiliary shortcut: returns true if position(location.getContig(), p) is past l */ + private boolean pastInterval(long p, GenomeLoc l) { + return ( location.getContigIndex() > l.getContigIndex() || + location.getContigIndex() == l.getContigIndex() && p > l.getStop() ); + } + + /** Emit calls of the specified type across genotyping intervals, from position lastGenotypedPosition+1 to + * pos-1, inclusive. + * @param contigIndex + * @param pos + * @param call + */ + /* + private void emitNoCallsUpTo(int contigIndex, long pos, CallType call) { + + if ( contigIndex < currentGenotypeInterval.getContigIndex() || + contigIndex == currentGenotypeInterval.getContigIndex() && pos <= currentGenotypeInterval.getStart() ) return; + + if ( contigIndex == currentGenotypeInterval.getContigIndex() && pos >= currentGenotypeInterval.getStart() ) { + for ( long p = lastGenotypedPosition+1; p < pos; p++ ) { + + } + } + while( currentGenotypeInterval != null ) { + + while ( ) + if ( genotypeIntervalIterator.hasNext() ) { + currentGenotypeInterval = genotypeIntervalIterator.next() ; + if ( pastInterval(p,currentGenotypeInterval) ) { + // if we are about to jump over the whole next interval, we need to emit NO_COVERAGE calls there! + emitNoCoverageCalls(currentGenotypeInterval); + } + } else { + currentGenotypeInterval = null; + } + } + } +*/ + + /** Output indel calls up to the specified position and shift the window: after this method is executed, the + * first element of the window maps onto 'position', if possible, or at worst a few bases to the left of 'position' if we may need more + * reads to get full NQS-style statistics for an indel in the close proximity of 'position'. + * + * @param position + */ + private void emit(long position, boolean force) { + + long adjustedPosition = adjustPosition(position); + + if ( adjustedPosition == -1 ) { + // failed to find appropriate shift position, the data are probably to messy anyway so we drop them altogether + normal_context.shift((int)(position-normal_context.getStart())); + return; + } + long move_to = adjustedPosition; + + for ( int pos = normal_context.getStart() ; pos < Math.min(adjustedPosition,normal_context.getStop()+1) ; pos++ ) { + + boolean genotype = false; + // first let's see if we need to genotype current position: + + final long p = pos - 1; // our internally used positions (pos) are +1 compared to external format spec (e.g. vcf) + + if ( pos <= lastGenotypedPosition ) continue; + + while ( currentGenotypeInterval != null ) { + + // if we did not even reach next interval yet, no genotyping at current position: + if ( location.getContigIndex() < currentGenotypeInterval.getContigIndex() || + location.getContigIndex() == currentGenotypeInterval.getContigIndex() && + p < currentGenotypeInterval.getStart() ) break; + if ( pastInterval(p, currentGenotypeInterval) ) { + // we are past current genotyping interval, so we are done with it; let's load next interval: + currentGenotypeInterval = genotypeIntervalIterator.hasNext() ? genotypeIntervalIterator.next() : null; + continue; // re-enter the loop to check against the interval we just loaded + } + + // we reach this point only if p is inside current genotyping interval; set the flag and bail out: + genotype = true; + break; + } + +// if ( DEBUG ) System.out.println("DEBUG>> pos="+pos +"; genotyping interval="+currentGenotypeInterval+"; genotype="+genotype); + + if ( normal_context.indelsAt(pos).size() == 0 && ! genotype ) continue; + + IndelPrecall normalCall = new IndelPrecall(normal_context,pos,NQS_WIDTH); + + if ( normalCall.getCoverage() < minCoverage && ! genotype ) { + if ( DEBUG ) { + System.out.println("DEBUG>> Indel at "+pos+"; coverare in normal="+normalCall.getCoverage()+" (SKIPPED)"); + } + continue; // low coverage + } + + if ( DEBUG ) System.out.println("DEBUG>> "+(normalCall.getAllVariantCount() == 0?"No Indel":"Indel")+" at "+pos); + + long left = Math.max( pos-NQS_WIDTH, normal_context.getStart() ); + long right = pos+( normalCall.getVariant() == null ? 0 : normalCall.getVariant().lengthOnRef())+NQS_WIDTH-1; + + if ( right >= adjustedPosition && ! force) { + // we are not asked to force-shift, and there is more coverage around the current indel that we still need to collect + + // we are not asked to force-shift, and there's still additional coverage to the right of current indel, so its too early to emit it; + // instead we shift only up to current indel pos - MISMATCH_WIDTH, so that we could keep collecting that coverage + move_to = adjustPosition(left); + if ( move_to == -1 ) { + // failed to find appropriate shift position, the data are probably to messy anyway so we drop them altogether + normal_context.shift((int)(adjustedPosition-normal_context.getStart())); + return; + } + if ( DEBUG ) System.out.println("DEBUG>> waiting for coverage; actual shift performed to "+ move_to); + break; + } + + // if indel is too close to the end of the window but we need to emit anyway (force-shift), adjust right: + if ( right > normal_context.getStop() ) right = normal_context.getStop(); + + // location = getToolkit().getGenomeLocParser().setStart(location,pos); + // location = getToolkit().getGenomeLocParser().setStop(location,pos); // retrieve annotation data + + location = getToolkit().getGenomeLocParser().createGenomeLoc(location.getContig(), pos); + + boolean haveCall = normalCall.isCall(); // cache the value + + if ( haveCall || genotype) { + if ( haveCall ) normalCallsMade++; + printVCFLine(vcf_writer,normalCall); + if ( bedWriter != null ) normalCall.printBedLine(bedWriter); + if ( verboseWriter != null ) printVerboseLine(verboseWriter, normalCall); + lastGenotypedPosition = pos; + } + + normal_context.indelsAt(pos).clear(); + // we dealt with this indel; don't want to see it again + // (we might otherwise in the case when 1) there is another indel that follows + // within MISMATCH_WIDTH bases and 2) we'd need to wait for more coverage for that next indel) + +// for ( IndelVariant var : variants ) { +// System.out.print("\t"+var.getType()+"\t"+var.getBases()+"\t"+var.getCount()); +// } + } + + if ( DEBUG ) System.out.println("DEBUG>> Actual shift to " + move_to + " ("+adjustedPosition+")"); + normal_context.shift((int)(move_to - normal_context.getStart() ) ); + } + + /** A shortcut. Returns true if we got indels within the specified interval in single and only window context + * (for single-sample calls) or in either of the two window contexts (for two-sample/somatic calls) + * + */ + private boolean indelsPresentInInterval(long start, long stop) { + if ( tumor_context == null ) return normal_context.hasIndelsInInterval(start,stop); + return tumor_context.hasIndelsInInterval(start,stop) || + normal_context.hasIndelsInInterval(start,stop); + } + /** Takes the position, to which window shift is requested, and tries to adjust it in such a way that no NQS window is broken. + * Namely, this method checks, iteratively, if there is an indel within NQS_WIDTH bases ahead of initially requested or adjusted + * shift position. If there is such an indel, + * then shifting to that position would lose some or all NQS-window bases to the left of the indel (since it's not going to be emitted + * just yet). Instead, this method tries to readjust the shift position leftwards so that full NQS window to the left of the next indel + * is preserved. This method tries thie strategy 4 times (so that it would never walk away too far to the left), and if it fails to find + * an appropriate adjusted shift position (which could happen if there are many indels following each other at short intervals), it will give up, + * go back to the original requested shift position and try finding the first shift poisition that has no indel associated with it. + */ + + private long adjustPosition(long request) { + long initial_request = request; + int attempts = 0; + boolean failure = false; + while ( indelsPresentInInterval(request,request+NQS_WIDTH) ) { + request -= NQS_WIDTH; + if ( DEBUG ) System.out.println("DEBUG>> indel observations present within "+NQS_WIDTH+" bases ahead. Resetting shift to "+request); + attempts++; + if ( attempts == 4 ) { + if ( DEBUG ) System.out.println("DEBUG>> attempts to preserve full NQS window failed; now trying to find any suitable position.") ; + failure = true; + break; + } + } + + if ( failure ) { + // we tried 4 times but did not find a good shift position that would preserve full nqs window + // around all indels. let's fall back and find any shift position as long and there's no indel at the very + // first position after the shift (this is bad for other reasons); if it breaks a nqs window, so be it + request = initial_request; + attempts = 0; + while ( indelsPresentInInterval(request,request+1) ) { + request--; + if ( DEBUG ) System.out.println("DEBUG>> indel observations present within "+NQS_WIDTH+" bases ahead. Resetting shift to "+request); + attempts++; + if ( attempts == 50 ) { + System.out.println("WARNING: Indel at every position in the interval "+refName+":"+request+"-"+initial_request+ + ". Can not find a break to shift context window to; no calls will be attempted in the current window."); + return -1; + } + } + } + if ( DEBUG ) System.out.println("DEBUG>> Found acceptable target position "+request); + return request; + } + + /** Output somatic indel calls up to the specified position and shift the coverage array(s): after this method is executed + * first elements of the coverage arrays map onto 'position', or a few bases prior to the specified position + * if there is an indel in close proximity to 'position' so that we may get more coverage around it later. + * + * @param position + */ + private void emit_somatic(long position, boolean force) { + + long adjustedPosition = adjustPosition(position); + if ( adjustedPosition == -1 ) { + // failed to find appropriate shift position, the data are probably to messy anyway so we drop them altogether + normal_context.shift((int)(position-normal_context.getStart())); + tumor_context.shift((int)(position-tumor_context.getStart())); + return; + } + long move_to = adjustedPosition; + + if ( DEBUG ) System.out.println("DEBUG>> Emitting in somatic mode up to "+position+" force shift="+force+" current window="+tumor_context.getStart()+"-"+tumor_context.getStop()); + + for ( int pos = tumor_context.getStart() ; pos < Math.min(adjustedPosition,tumor_context.getStop()+1) ; pos++ ) { + + boolean genotype = false; + // first let's see if we need to genotype current position: + + final long p = pos - 1; // our internally used positions (pos) are +1 compared to external format spec (e.g. vcf) + + if ( pos <= lastGenotypedPosition ) continue; + + while ( currentGenotypeInterval != null ) { + + // if we did not even reach next interval yet, no genotyping at current position: + if ( location.getContigIndex() < currentGenotypeInterval.getContigIndex() || + location.getContigIndex() == currentGenotypeInterval.getContigIndex() && + p < currentGenotypeInterval.getStart() ) break; + if ( pastInterval(p, currentGenotypeInterval) ) { + // we are past current genotyping interval, so we are done with it; let's load next interval: + currentGenotypeInterval = genotypeIntervalIterator.hasNext() ? genotypeIntervalIterator.next() : null; + continue; // re-enter the loop to check against the interval we just loaded + } + + // we reach tjis point only if p is inside current genotyping interval; set the flag and bail out: + genotype = true; + break; + } +// if ( DEBUG) System.out.println("DEBUG>> pos="+pos +"; genotyping interval="+currentGenotypeInterval+"; genotype="+genotype); + + if ( tumor_context.indelsAt(pos).size() == 0 && ! genotype ) continue; // no indels in tumor + + if ( DEBUG && genotype ) System.out.println("DEBUG>> Genotyping requested at "+pos); + + IndelPrecall tumorCall = new IndelPrecall(tumor_context,pos,NQS_WIDTH); + IndelPrecall normalCall = new IndelPrecall(normal_context,pos,NQS_WIDTH); + + if ( tumorCall.getCoverage() < minCoverage && ! genotype ) { + if ( DEBUG ) { + System.out.println("DEBUG>> Indel in tumor at "+pos+"; coverare in tumor="+tumorCall.getCoverage()+" (SKIPPED)"); + } + continue; // low coverage + } + if ( normalCall.getCoverage() < minNormalCoverage && ! genotype ) { + if ( DEBUG ) { + System.out.println("DEBUG>> Indel in tumor at "+pos+"; coverare in normal="+normalCall.getCoverage()+" (SKIPPED)"); + } + continue; // low coverage + } + + if ( DEBUG ) { + System.out.print("DEBUG>> "+(tumorCall.getAllVariantCount() == 0?"No Indel":"Indel")+" in tumor, "); + System.out.print("DEBUG>> "+(normalCall.getAllVariantCount() == 0?"No Indel":"Indel")+" in normal at "+pos); + } + + long left = Math.max( pos-NQS_WIDTH, tumor_context.getStart() ); + long right = pos+ ( tumorCall.getVariant() == null ? 0 : tumorCall.getVariant().lengthOnRef() )+NQS_WIDTH-1; + + if ( right >= adjustedPosition && ! force) { + // we are not asked to force-shift, and there is more coverage around the current indel that we still need to collect + + // we are not asked to force-shift, and there's still additional coverage to the right of current indel, so its too early to emit it; + // instead we shift only up to current indel pos - MISMATCH_WIDTH, so that we could keep collecting that coverage + move_to = adjustPosition(left); + if ( move_to == -1 ) { + // failed to find appropriate shift position, the data are probably to messy anyway so we drop them altogether + normal_context.shift((int)(adjustedPosition-normal_context.getStart())); + tumor_context.shift((int)(adjustedPosition-tumor_context.getStart())); + return; + } + if ( DEBUG ) System.out.println("DEBUG>> waiting for coverage; actual shift performed to "+ move_to); + break; + } + + if ( right > tumor_context.getStop() ) right = tumor_context.getStop(); // if indel is too close to the end of the window but we need to emit anyway (force-shift), adjust right + +// location = getToolkit().getGenomeLocParser().setStart(location,pos); +// location = getToolkit().getGenomeLocParser().setStop(location,pos); // retrieve annotation data + + location = getToolkit().getGenomeLocParser().createGenomeLoc(location.getContig(),pos); // retrieve annotation data + + boolean haveCall = tumorCall.isCall(); // cache the value + + if ( haveCall || genotype ) { + if ( haveCall ) tumorCallsMade++; + + printVCFLine(vcf_writer,normalCall,tumorCall); + + if ( bedWriter != null ) tumorCall.printBedLine(bedWriter); + + if ( verboseWriter != null ) printVerboseLine(verboseWriter, normalCall, tumorCall ); + lastGenotypedPosition = pos; + } + tumor_context.indelsAt(pos).clear(); + normal_context.indelsAt(pos).clear(); + // we dealt with this indel; don't want to see it again + // (we might otherwise in the case when 1) there is another indel that follows + // within MISMATCH_WIDTH bases and 2) we'd need to wait for more coverage for that next indel) + +// for ( IndelVariant var : variants ) { +// System.out.print("\t"+var.getType()+"\t"+var.getBases()+"\t"+var.getCount()); +// } + } + + if ( DEBUG ) System.out.println("DEBUG>> Actual shift to " + move_to + " ("+adjustedPosition+")"); + tumor_context.shift((int)(move_to - tumor_context.getStart() ) ); + normal_context.shift((int)(move_to - normal_context.getStart() ) ); + } + + private String makeFullRecord(IndelPrecall normalCall, IndelPrecall tumorCall) { + StringBuilder fullRecord = new StringBuilder(); + if ( tumorCall.getVariant() != null || normalCall.getVariant() == null) { + fullRecord.append(tumorCall.makeEventString()); + } else { + fullRecord.append(normalCall.makeEventString()); + } + fullRecord.append('\t'); + fullRecord.append(normalCall.makeStatsString("N_")); + fullRecord.append('\t'); + fullRecord.append(tumorCall.makeStatsString("T_")); + fullRecord.append('\t'); + return fullRecord.toString(); + } + + private String makeFullRecord(IndelPrecall normalCall) { + StringBuilder fullRecord = new StringBuilder(); + fullRecord.append(normalCall.makeEventString()); + fullRecord.append('\t'); + fullRecord.append(normalCall.makeStatsString("")); + fullRecord.append('\t'); + return fullRecord.toString(); + } + + private String getAnnotationString(RODRecordList ann) { + if ( ann == null ) return annGenomic; + else { + StringBuilder b = new StringBuilder(); + + if ( RefSeqFeature.isExon(ann) ) { + if ( RefSeqFeature.isCodingExon(ann) ) b.append(annCoding); // both exon and coding = coding exon sequence + else b.append(annUTR); // exon but not coding = UTR + } else { + if ( RefSeqFeature.isCoding(ann) ) b.append(annIntron); // not in exon, but within the coding region = intron + else b.append(annUnknown); // we have no idea what this is. this may actually happen when we have a fully non-coding exon... + } + b.append('\t'); + b.append(((Transcript)ann.get(0).getUnderlyingObject()).getGeneName()); // there is at least one transcript in the list, guaranteed +// while ( it.hasNext() ) { // +// t.getGeneName() +// } + return b.toString(); + } + + } + + public void printVerboseLine(Writer verboseWriter, IndelPrecall normalCall) { + RODRecordList annotationList = (refseqIterator == null ? null : refseqIterator.seekForward(location)); + String annotationString = (refseqIterator == null ? "" : getAnnotationString(annotationList)); + + StringBuilder fullRecord = new StringBuilder(); + fullRecord.append(makeFullRecord(normalCall)); + fullRecord.append(annotationString); + if ( ! normalCall.isCall() && normalCall.getVariant() != null ) fullRecord.append("\tFILTERED_NOCALL"); + try { + verboseWriter.write(fullRecord.toString()); + verboseWriter.write('\n'); + } catch (IOException e) { + throw new UserException.CouldNotCreateOutputFile(verboseOutput, "Write failed", e); + } + + } + + + public void printVerboseLine(Writer verboseWriter, IndelPrecall normalCall, IndelPrecall tumorCall) { + RODRecordList annotationList = (refseqIterator == null ? null : refseqIterator.seekForward(location)); + String annotationString = (refseqIterator == null ? "" : getAnnotationString(annotationList)); + + StringBuilder fullRecord = new StringBuilder(); + fullRecord.append(makeFullRecord(normalCall,tumorCall)); + + if ( normalCall.getVariant() == null && tumorCall.getVariant() == null ) { + // did not observe anything + if ( normalCall.getCoverage() >= minNormalCoverage && tumorCall.getCoverage() >= minCoverage ) fullRecord.append("REFERENCE"); + else { + if ( tumorCall.getCoverage() >= minCoverage ) fullRecord.append("REFERENCE"); // no coverage in normal but nothing in tumor + else { + // no coverage in tumor; if we have no coverage in normal, it can be anything; if we do have coverage in normal, + // this still could be a somatic event. so either way it is 'unknown' + fullRecord.append("UNKNOWN"); + } + } + + } + + if ( normalCall.getVariant() == null && tumorCall.getVariant() != null ) { + // looks like somatic call + if ( normalCall.getCoverage() >= minNormalCoverage ) fullRecord.append("SOMATIC"); // we confirm there is nothing in normal + else { + // low coverage in normal + fullRecord.append("EVENT_T"); // no coverage in normal, no idea whether it is germline or somatic + } + } + + if ( normalCall.getVariant() != null && tumorCall.getVariant() == null ) { + // it's likely germline (with missing observation in tumor - maybe loh? + if ( tumorCall.getCoverage() >= minCoverage ) fullRecord.append("GERMLINE_LOH"); // we confirm there is nothing in tumor + else { + // low coverage in tumor, maybe we missed the event + fullRecord.append("GERMLINE"); // no coverage in tumor but we already saw it in normal... + } + } + + if ( normalCall.getVariant() != null && tumorCall.getVariant() != null ) { + // events in both T/N, got to be germline! + fullRecord.append("GERMLINE"); + } + + + fullRecord.append('\t'); + fullRecord.append(annotationString); + + if ( ! tumorCall.isCall() && tumorCall.getVariant() != null ) fullRecord.append("\tFILTERED_NOCALL"); + + try { + verboseWriter.write(fullRecord.toString()); + verboseWriter.write('\n'); + } catch (IOException e) { + throw new UserException.CouldNotCreateOutputFile(verboseOutput, "Write failed", e); + } + } + + public void printVCFLine(VCFWriter vcf, IndelPrecall call) { + + long start = call.getPosition()-1; + // If the beginning of the chromosome is deleted (possible, however unlikely), it's unclear how to proceed. + // The suggestion is instead of putting the base before the indel, to put the base after the indel. + // For now, just don't print out that site. + if ( start == 0 ) + return; + + long stop = start; + + List alleles = new ArrayList(2); // actual observed (distinct!) alleles at the site + List homref_alleles = null; // when needed, will contain two identical copies of ref allele - needed to generate hom-ref genotype + + + if ( call.getVariant() == null ) { + // we will need to cteate genotype with two (hom) ref alleles (below). + // we can not use 'alleles' list here, since that list is supposed to contain + // only *distinct* alleles observed at the site or VCFContext will frown upon us... + alleles.add( Allele.create(refBases[(int)start-1],true) ); + homref_alleles = new ArrayList(2); + homref_alleles.add( alleles.get(0)); + homref_alleles.add( alleles.get(0)); + } else { + // we always create alt allele when we observe anything but the ref, even if it is not a call! + // (Genotype will tell us whether it is an actual call or not!) + int event_length = call.getVariant().lengthOnRef(); + if ( event_length < 0 ) event_length = 0; + fillAlleleList(alleles,call); + stop += event_length; + } + + Map genotypes = new HashMap(); + + for ( String sample : normalSamples ) { + + Map attrs = call.makeStatsAttributes(null); + + if ( call.isCall() ) // we made a call - put actual het genotype here: + genotypes.put(sample,new Genotype(sample,alleles,Genotype.NO_NEG_LOG_10PERROR,null,attrs,false)); + else // no call: genotype is ref/ref (but alleles still contain the alt if we observed anything at all) + genotypes.put(sample,new Genotype(sample, homref_alleles,Genotype.NO_NEG_LOG_10PERROR,null,attrs,false)); + + } + Set filters = null; + if ( call.getVariant() != null && ! call.isCall() ) { + filters = new HashSet(); + filters.add("NoCall"); + } + VariantContext vc = new VariantContext("IGv2_Indel_call", refName, start, stop, alleles, genotypes, + -1.0 /* log error */, filters, null, refBases[(int)start-1]); + vcf.add(vc); + } + + /** Fills l with appropriate alleles depending on whether call is insertion or deletion + * (l MUST have a variant or this method will crash). It is guaranteed that the *first* allele added + * to the list is ref, and the next one is alt. + * @param l + * @param call + */ + private void fillAlleleList(List l, IndelPrecall call) { + int event_length = call.getVariant().lengthOnRef(); + if ( event_length == 0 ) { // insertion + + l.add( Allele.create(Allele.NULL_ALLELE_STRING,true) ); + l.add( Allele.create(call.getVariant().getBases(), false )); + + } else { //deletion: + l.add( Allele.create(call.getVariant().getBases(), true )); + l.add( Allele.create(Allele.NULL_ALLELE_STRING,false) ); + } + } + + public void printVCFLine(VCFWriter vcf, IndelPrecall nCall, IndelPrecall tCall) { + + long start = tCall.getPosition()-1; + long stop = start; + + // If the beginning of the chromosome is deleted (possible, however unlikely), it's unclear how to proceed. + // The suggestion is instead of putting the base before the indel, to put the base after the indel. + // For now, just don't print out that site. + if ( start == 0 ) + return; + + Map attrsNormal = nCall.makeStatsAttributes(null); + Map attrsTumor = tCall.makeStatsAttributes(null); + + Map attrs = new HashMap(); + + boolean isSomatic = false; + if ( nCall.getCoverage() >= minNormalCoverage && nCall.getVariant() == null && tCall.getVariant() != null ) { + isSomatic = true; + attrs.put(VCFConstants.SOMATIC_KEY,true); + } + List alleles = new ArrayList(2); // all alleles at the site + // List normal_alleles = null; // all alleles at the site + List homRefAlleles = null; + +// if ( nCall.getVariant() == null || tCall.getVariant() == null ) { + homRefAlleles = new ArrayList(2) ; // we need this for somatic calls (since normal is ref-ref), and also for no-calls +// } + boolean homRefT = ( tCall.getVariant() == null ); + boolean homRefN = ( nCall.getVariant() == null ); + if ( tCall.getVariant() == null && nCall.getVariant() == null) { + // no indel at all ; create base-representation ref/ref alleles for genotype construction + alleles.add( Allele.create(refBases[(int)start-1],true) ); + } else { + // we got indel(s) + int event_length = 0; + if ( tCall.getVariant() != null ) { + // indel in tumor + event_length = tCall.getVariant().lengthOnRef(); + fillAlleleList(alleles, tCall); + } else { + event_length = nCall.getVariant().lengthOnRef(); + fillAlleleList(alleles, nCall); + } + if ( event_length > 0 ) stop += event_length; + } + homRefAlleles.add( alleles.get(0)); + homRefAlleles.add( alleles.get(0)); + + Map genotypes = new HashMap(); + + for ( String sample : normalSamples ) { + genotypes.put(sample,new Genotype(sample, homRefN ? homRefAlleles : alleles,Genotype.NO_NEG_LOG_10PERROR,null,attrsNormal,false)); + } + + for ( String sample : tumorSamples ) { + genotypes.put(sample,new Genotype(sample, homRefT ? homRefAlleles : alleles,Genotype.NO_NEG_LOG_10PERROR,null,attrsTumor,false) ); + } + + Set filters = null; + if ( tCall.getVariant() != null && ! tCall.isCall() ) { + filters = new HashSet(); + filters.add("NoCall"); + } + if ( nCall.getCoverage() < minNormalCoverage ) { + if ( filters == null ) filters = new HashSet(); + filters.add("NCov"); + } + if ( tCall.getCoverage() < minCoverage ) { + if ( filters == null ) filters = new HashSet(); + filters.add("TCov"); + } + + VariantContext vc = new VariantContext("IGv2_Indel_call", refName, start, stop, alleles, genotypes, + -1.0 /* log error */, filters, attrs, refBases[(int)start-1]); + vcf.add(vc); + } + + @Override + public void onTraversalDone(Integer result) { + if ( DEBUG ) { + System.out.println("DEBUG>> Emitting last window at "+normal_context.getStart()+"-"+normal_context.getStop()); + } + if ( call_somatic ) emit_somatic(1000000000, true); + else emit(1000000000,true); // emit everything we might have left + + if ( metricsWriter != null ) { + metricsWriter.println(String.format("Normal calls made %d", normalCallsMade)); + metricsWriter.println(String.format("Tumor calls made %d", tumorCallsMade)); + metricsWriter.close(); + } + + try { + if ( bedWriter != null ) bedWriter.close(); + if ( verboseWriter != null ) verboseWriter.close(); + } catch (IOException e) { + System.out.println("Failed to close output BED file gracefully, data may be lost"); + e.printStackTrace(); + } + super.onTraversalDone(result); + } + + @Override + public Integer reduce(Integer value, Integer sum) { + if ( value == -1 ) { + onTraversalDone(sum); + System.exit(1); + } + sum += value; + return sum; + } + + @Override + public Integer reduceInit() { + return new Integer(0); + } + + + static class IndelVariant { + public static enum Type { I, D}; + private String bases; + private Type type; + private ArrayList fromStartOffsets = null; + private ArrayList fromEndOffsets = null; + + private Set reads = new HashSet(); // keep track of reads that have this indel + private Set samples = new HashSet(); // which samples had the indel described by this object + + public IndelVariant(ExpandedSAMRecord read , Type type, String bases) { + this.type = type; + this.bases = bases.toUpperCase(); + addObservation(read); + fromStartOffsets = new ArrayList(); + fromEndOffsets = new ArrayList(); + } + + /** Adds another observation for the current indel. It is assumed that the read being registered + * does contain the observation, no checks are performed. Read's sample is added to the list of samples + * this indel was observed in as well. + * @param read + */ + public void addObservation(ExpandedSAMRecord read) { + if ( reads.contains(read) ) { + //TODO fix CleanedReadInjector and reinstate exception here: duplicate records may signal a problem with the bam + // seeing the same read again can mean only one thing: the input bam file is corrupted and contains + // duplicate records. We KNOW that this may happen for the time being due to bug in CleanedReadInjector + // so this is a short-term patch: don't cry, but just ignore the duplicate record + + //throw new StingException("Attempting to add indel observation that was already registered"); + return; + } + reads.add(read); + String sample = null; + if ( read.getSAMRecord().getReadGroup() != null ) sample = read.getSAMRecord().getReadGroup().getSample(); + if ( sample != null ) samples.add(sample); + } + + + /** Returns length of the event on the reference (number of deleted bases + * for deletions, -1 for insertions. + * @return + */ + public int lengthOnRef() { + if ( type == Type.D ) return bases.length(); + else return 0; + } + + + public void addSample(String sample) { + if ( sample != null ) + samples.add(sample); + } + + public void addReadPositions(int fromStart, int fromEnd) { + fromStartOffsets.add(fromStart); + fromEndOffsets.add(fromEnd); + } + + public List getOffsetsFromStart() { return fromStartOffsets ; } + public List getOffsetsFromEnd() { return fromEndOffsets; } + + public String getSamples() { + StringBuffer sb = new StringBuffer(); + Iterator i = samples.iterator(); + while ( i.hasNext() ) { + sb.append(i.next()); + if ( i.hasNext() ) + sb.append(","); + } + return sb.toString(); + } + + public Set getReadSet() { return reads; } + + public int getCount() { return reads.size(); } + + public String getBases() { return bases; } + + public Type getType() { return type; } + + @Override + public boolean equals(Object o) { + if ( ! ( o instanceof IndelVariant ) ) return false; + IndelVariant that = (IndelVariant)o; + return ( this.type == that.type && this.bases.equals(that.bases) ); + } + + public boolean equals(Type type, String bases) { + return ( this.type == type && this.bases.equals(bases.toUpperCase()) ); + } + } + + /** + * Utility class that encapsulates the logic related to collecting all the stats and counts required to + * make (or discard) a call, as well as the calling heuristics that uses those data. + */ + class IndelPrecall { +// private boolean DEBUG = false; + private int NQS_MISMATCH_CUTOFF = 1000000; + private double AV_MISMATCHES_PER_READ = 1.5; + + private int nqs = 0; + private IndelVariant consensus_indel = null; // indel we are going to call + private long pos = -1 ; // position on the ref + private int total_coverage = 0; // total number of reads overlapping with the event + private int consensus_indel_count = 0; // number of reads, in which consensus indel was observed + private int all_indel_count = 0 ; // number of reads, in which any indel was observed at current position + + private int total_mismatches_in_nqs_window = 0; // total number of mismatches in the nqs window around the indel + private int total_bases_in_nqs_window = 0; // total number of bases in the nqs window (some reads may not fully span the window so it's not coverage*nqs_size) + private int total_base_qual_in_nqs_window = 0; // sum of qualitites of all the bases in the nqs window + private int total_mismatching_base_qual_in_nqs_window = 0; // sum of qualitites of all mismatching bases in the nqs window + + private int indel_read_mismatches_in_nqs_window = 0; // mismatches inside the nqs window in indel-containing reads only + private int indel_read_bases_in_nqs_window = 0; // number of bases in the nqs window from indel-containing reads only + private int indel_read_base_qual_in_nqs_window = 0; // sum of qualitites of bases in nqs window from indel-containing reads only + private int indel_read_mismatching_base_qual_in_nqs_window = 0; // sum of qualitites of mismatching bases in the nqs window from indel-containing reads only + + + private int consensus_indel_read_mismatches_in_nqs_window = 0; // mismatches within the nqs window from consensus indel reads only + private int consensus_indel_read_bases_in_nqs_window = 0; // number of bases in the nqs window from consensus indel-containing reads only + private int consensus_indel_read_base_qual_in_nqs_window = 0; // sum of qualitites of bases in nqs window from consensus indel-containing reads only + private int consensus_indel_read_mismatching_base_qual_in_nqs_window = 0; // sum of qualitites of mismatching bases in the nqs window from consensus indel-containing reads only + + + private double consensus_indel_read_total_mm = 0.0; // sum of all mismatches in reads that contain consensus indel + private double all_indel_read_total_mm = 0.0; // sum of all mismatches in reads that contain any indel at given position + private double all_read_total_mm = 0.0; // sum of all mismatches in all reads + + private double consensus_indel_read_total_mapq = 0.0; // sum of mapping qualitites of all reads with consensus indel + private double all_indel_read_total_mapq = 0.0 ; // sum of mapping qualitites of all reads with (any) indel at current position + private double all_read_total_mapq = 0.0; // sum of all mapping qualities of all reads + + private PrimitivePair.Int consensus_indel_read_orientation_cnt = new PrimitivePair.Int(); + private PrimitivePair.Int all_indel_read_orientation_cnt = new PrimitivePair.Int(); + private PrimitivePair.Int all_read_orientation_cnt = new PrimitivePair.Int(); + + private int from_start_median = 0; + private int from_start_mad = 0; + private int from_end_median = 0; + private int from_end_mad = 0; + + /** Makes an empty call (no-call) with all stats set to 0 + * + * @param position + */ + public IndelPrecall(long position) { + this.pos = position; + } + + public IndelPrecall(WindowContext context, long position, int nqs_width) { + this.pos = position; + this.nqs = nqs_width; + total_coverage = context.coverageAt(pos,true); + List variants = context.indelsAt(pos); + findConsensus(variants); + + // pos is the first base after the event: first deleted base or first base after insertion. + // hence, [pos-nqs, pos+nqs-1] (inclusive) is the window with nqs bases on each side of a no-event or an insertion + // and [pos-nqs, pos+Ndeleted+nqs-1] is the window with nqs bases on each side of a deletion. + // we initialize the nqs window for no-event/insertion case + long left = Math.max( pos-nqs, context.getStart() ); + long right = Math.min(pos+nqs-1, context.getStop()); +//if ( pos == 3534096 ) System.out.println("pos="+pos +" total reads: "+context.getReads().size()); + Iterator read_iter = context.getReads().iterator(); + + + while ( read_iter.hasNext() ) { + ExpandedSAMRecord rec = read_iter.next(); + SAMRecord read = rec.getSAMRecord(); + byte[] flags = rec.getExpandedMMFlags(); + byte[] quals = rec.getExpandedQuals(); + int mm = rec.getMMCount(); + + + if( read.getAlignmentStart() > pos || read.getAlignmentEnd() < pos ) continue; + + long local_right = right; // end of nqs window for this particular read. May need to be advanced further right + // if read has a deletion. The gap in the middle of nqs window will be skipped + // automatically since flags/quals are set to -1 there + + boolean read_has_a_variant = false; + boolean read_has_consensus = ( consensus_indel!= null && consensus_indel.getReadSet().contains(rec) ); + for ( IndelVariant v : variants ) { + if ( v.getReadSet().contains(rec) ) { + read_has_a_variant = true; + local_right += v.lengthOnRef(); + break; + } + } + + if ( read_has_consensus ) { + consensus_indel_read_total_mm += mm; + consensus_indel_read_total_mapq += read.getMappingQuality(); + if ( read.getReadNegativeStrandFlag() ) consensus_indel_read_orientation_cnt.second++; + else consensus_indel_read_orientation_cnt.first++; + } + if ( read_has_a_variant ) { + all_indel_read_total_mm += mm; + all_indel_read_total_mapq += read.getMappingQuality(); + if ( read.getReadNegativeStrandFlag() ) all_indel_read_orientation_cnt.second++; + else all_indel_read_orientation_cnt.first++; + } + + all_read_total_mm+= mm; + all_read_total_mapq += read.getMappingQuality(); + if ( read.getReadNegativeStrandFlag() ) all_read_orientation_cnt.second++; + else all_read_orientation_cnt.first++; + + for ( int pos_in_flags = Math.max((int)(left - read.getAlignmentStart()),0); + pos_in_flags <= Math.min((int)local_right-read.getAlignmentStart(),flags.length - 1); + pos_in_flags++) { + + if ( flags[pos_in_flags] == -1 ) continue; // gap (deletion), skip it; we count only bases aligned to the ref + total_bases_in_nqs_window++; + if ( read_has_consensus ) consensus_indel_read_bases_in_nqs_window++; + if ( read_has_a_variant ) indel_read_bases_in_nqs_window++; + + if ( quals[pos_in_flags] != -1 ) { + + total_base_qual_in_nqs_window += quals[pos_in_flags]; + if ( read_has_a_variant ) indel_read_base_qual_in_nqs_window += quals[pos_in_flags]; + if ( read_has_consensus ) consensus_indel_read_base_qual_in_nqs_window += quals[pos_in_flags]; + } + + if ( flags[pos_in_flags] == 1 ) { // it's a mismatch + total_mismatches_in_nqs_window++; + total_mismatching_base_qual_in_nqs_window += quals[pos_in_flags]; + + if ( read_has_consensus ) { + consensus_indel_read_mismatches_in_nqs_window++; + consensus_indel_read_mismatching_base_qual_in_nqs_window += quals[pos_in_flags]; + } + + if ( read_has_a_variant ) { + indel_read_mismatches_in_nqs_window++; + indel_read_mismatching_base_qual_in_nqs_window += quals[pos_in_flags]; + } + } + } +// if ( pos == 3534096 ) { +// System.out.println(read.getReadName()); +// System.out.println(" cons nqs bases="+consensus_indel_read_bases_in_nqs_window); +// System.out.println(" qual sum="+consensus_indel_read_base_qual_in_nqs_window); +// } + + } + + // compute median/mad for offsets from the read starts/ends + if ( consensus_indel != null ) { + from_start_median = median(consensus_indel.getOffsetsFromStart()) ; + from_start_mad = mad(consensus_indel.getOffsetsFromStart(),from_start_median); + from_end_median = median(consensus_indel.getOffsetsFromEnd()) ; + from_end_mad = mad(consensus_indel.getOffsetsFromEnd(),from_end_median); + } + } + + /** As a side effect will sort l! + * + * @param l + * @return + */ + private int median(List l) { + Collections.sort(l); + int k = l.size()/2; + return ( l.size() % 2 == 0 ? + (l.get(k-1).intValue()+l.get(k).intValue())/2 : + l.get(k).intValue()); + } + + private int median(int[] l) { + Arrays.sort(l); + int k = l.length/2; + return ( l.length % 2 == 0 ? + (l[k-1]+l[k])/2 : + l[k]); + } + + private int mad(List l, int med) { + int [] diff = new int[l.size()]; + for ( int i = 0; i < l.size(); i++ ) { + diff[i] = Math.abs(l.get(i).intValue() - med); + } + return median(diff); + } + + public long getPosition() { return pos; } + + public boolean hasObservation() { return consensus_indel != null; } + + public int getCoverage() { return total_coverage; } + + public double getTotalMismatches() { return all_read_total_mm; } + public double getConsensusMismatches() { return consensus_indel_read_total_mm; } + public double getAllVariantMismatches() { return all_indel_read_total_mm; } + + /** Returns average number of mismatches per consensus indel-containing read */ + public double getAvConsensusMismatches() { + return ( consensus_indel_count != 0 ? consensus_indel_read_total_mm/consensus_indel_count : 0.0 ); + } + + /** Returns average number of mismatches per read across all reads matching the ref (not containing any indel variants) */ + public double getAvRefMismatches() { + int coverage_ref = total_coverage-all_indel_count; + return ( coverage_ref != 0 ? (all_read_total_mm - all_indel_read_total_mm )/coverage_ref : 0.0 ); + } + + public PrimitivePair.Int getConsensusStrandCounts() { + return consensus_indel_read_orientation_cnt; + } + + public PrimitivePair.Int getRefStrandCounts() { + return new PrimitivePair.Int(all_read_orientation_cnt.first-all_indel_read_orientation_cnt.first, + all_read_orientation_cnt.second - all_indel_read_orientation_cnt.second); + } + + /** Returns a sum of mapping qualities of all reads spanning the event. */ + public double getTotalMapq() { return all_read_total_mapq; } + + /** Returns a sum of mapping qualities of all reads, in which the consensus variant is observed. */ + public double getConsensusMapq() { return consensus_indel_read_total_mapq; } + + /** Returns a sum of mapping qualities of all reads, in which any variant is observed at the current event site. */ + public double getAllVariantMapq() { return all_indel_read_total_mapq; } + + /** Returns average mapping quality per consensus indel-containing read. */ + public double getAvConsensusMapq() { + return ( consensus_indel_count != 0 ? consensus_indel_read_total_mapq/consensus_indel_count : 0.0 ); + } + + /** Returns average number of mismatches per read across all reads matching the ref (not containing any indel variants). */ + public double getAvRefMapq() { + int coverage_ref = total_coverage-all_indel_count; + return ( coverage_ref != 0 ? (all_read_total_mapq - all_indel_read_total_mapq )/coverage_ref : 0.0 ); + } + + /** Returns fraction of bases in NQS window around the indel that are mismatches, across all reads, + * in which consensus indel is observed. NOTE: NQS window for indel containing reads is defined around + * the indel itself (e.g. for a 10-base deletion spanning [X,X+9], the 5-NQS window is {[X-5,X-1],[X+10,X+15]} + * */ + public double getNQSConsensusMMRate() { + if ( consensus_indel_read_bases_in_nqs_window == 0 ) return 0; + return ((double)consensus_indel_read_mismatches_in_nqs_window)/consensus_indel_read_bases_in_nqs_window; + } + + /** Returns fraction of bases in NQS window around the indel start position that are mismatches, across all reads + * that align to the ref (i.e. contain no indel observation at the current position). NOTE: NQS window for ref + * reads is defined around the event start position, NOT around the actual consensus indel. + * */ + public double getNQSRefMMRate() { + int num_ref_bases = total_bases_in_nqs_window - indel_read_bases_in_nqs_window; + if ( num_ref_bases == 0 ) return 0; + return ((double)(total_mismatches_in_nqs_window - indel_read_mismatches_in_nqs_window))/num_ref_bases; + } + + /** Returns average base quality in NQS window around the indel, across all reads, + * in which consensus indel is observed. NOTE: NQS window for indel containing reads is defined around + * the indel itself (e.g. for a 10-base deletion spanning [X,X+9], the 5-NQS window is {[X-5,X-1],[X+10,X+15]} + * */ + public double getNQSConsensusAvQual() { + if ( consensus_indel_read_bases_in_nqs_window == 0 ) return 0; + return ((double)consensus_indel_read_base_qual_in_nqs_window)/consensus_indel_read_bases_in_nqs_window; + } + + /** Returns fraction of bases in NQS window around the indel start position that are mismatches, across all reads + * that align to the ref (i.e. contain no indel observation at the current position). NOTE: NQS window for ref + * reads is defined around the event start position, NOT around the actual consensus indel. + * */ + public double getNQSRefAvQual() { + int num_ref_bases = total_bases_in_nqs_window - indel_read_bases_in_nqs_window; + if ( num_ref_bases == 0 ) return 0; + return ((double)(total_base_qual_in_nqs_window - indel_read_base_qual_in_nqs_window))/num_ref_bases; + } + + public int getTotalNQSMismatches() { return total_mismatches_in_nqs_window; } + + public int getAllVariantCount() { return all_indel_count; } + public int getConsensusVariantCount() { return consensus_indel_count; } + +// public boolean failsNQSMismatch() { +// //TODO wrong fraction: mismatches are counted only in indel-containing reads, but total_coverage is used! +// return ( indel_read_mismatches_in_nqs_window > NQS_MISMATCH_CUTOFF ) || +// ( indel_read_mismatches_in_nqs_window > total_coverage * AV_MISMATCHES_PER_READ ); +// } + + public IndelVariant getVariant() { return consensus_indel; } + + public boolean isCall() { + boolean ret = ( consensus_indel_count >= minIndelCount && + (double)consensus_indel_count > minFraction * total_coverage && + (double) consensus_indel_count > minConsensusFraction*all_indel_count && total_coverage >= minCoverage); + if ( DEBUG && ! ret ) System.out.println("DEBUG>> NOT a call: count="+consensus_indel_count+ + " total_count="+all_indel_count+" cov="+total_coverage+ + " minConsensusF="+((double)consensus_indel_count)/all_indel_count+ + " minF="+((double)consensus_indel_count)/total_coverage); + return ret; + + } + + /** Utility method: finds the indel variant with the largest count (ie consensus) among all the observed + * variants, and sets the counts of consensus observations and all observations of any indels (including non-consensus) + * @param variants + * @return + */ + private void findConsensus(List variants) { + for ( IndelVariant var : variants ) { + if ( DEBUG ) System.out.println("DEBUG>> Variant "+var.getBases()+" (cnt="+var.getCount()+")"); + int cnt = var.getCount(); + all_indel_count +=cnt; + if ( cnt > consensus_indel_count ) { + consensus_indel = var; + consensus_indel_count = cnt; + } + } + if ( DEBUG && consensus_indel != null ) System.out.println("DEBUG>> Returning: "+consensus_indel.getBases()+ + " (cnt="+consensus_indel.getCount()+") with total count of "+all_indel_count); + } + + + + public void printBedLine(Writer bed) { + int event_length; + if ( consensus_indel == null ) event_length = 0; + else { + event_length = consensus_indel.lengthOnRef(); + if ( event_length < 0 ) event_length = 0; + } + + StringBuffer message = new StringBuffer(); + message.append(refName+"\t"+(pos-1)+"\t"); + message.append((pos-1+event_length)+"\t"); + if ( consensus_indel != null ) { + message.append((event_length>0? "-":"+")+consensus_indel.getBases()); + } else { + message.append('.'); + } + message.append(":"+all_indel_count+"/"+total_coverage); + try { + bed.write(message.toString()+"\n"); + } catch (IOException e) { + throw new UserException.CouldNotCreateOutputFile(bedOutput, "Error encountered while writing into output BED file", e); + } + } + + public String makeEventString() { + int event_length; + if ( consensus_indel == null ) event_length = 0; + else { + event_length = consensus_indel.lengthOnRef(); + if ( event_length < 0 ) event_length = 0; + } + StringBuffer message = new StringBuffer(); + message.append(refName); + message.append('\t'); + message.append(pos-1); + message.append('\t'); + message.append(pos-1+event_length); + message.append('\t'); + if ( consensus_indel != null ) { + message.append((event_length>0?'-':'+')); + message.append(consensus_indel.getBases()); + } else { + message.append('.'); + } + return message.toString(); + } + + public String makeStatsString(String prefix) { + StringBuilder message = new StringBuilder(); + message.append(prefix+"OBS_COUNTS[C/A/T]:"+getConsensusVariantCount()+"/"+getAllVariantCount()+"/"+getCoverage()); + message.append('\t'); + message.append(prefix+"AV_MM[C/R]:"+String.format("%.2f/%.2f",getAvConsensusMismatches(), + getAvRefMismatches())); + message.append('\t'); + message.append(prefix+"AV_MAPQ[C/R]:"+String.format("%.2f/%.2f",getAvConsensusMapq(), + getAvRefMapq())); + message.append('\t'); + message.append(prefix+"NQS_MM_RATE[C/R]:"+String.format("%.2f/%.2f",getNQSConsensusMMRate(),getNQSRefMMRate())); + message.append('\t'); + message.append(prefix+"NQS_AV_QUAL[C/R]:"+String.format("%.2f/%.2f",getNQSConsensusAvQual(),getNQSRefAvQual())); + + PrimitivePair.Int strand_cons = getConsensusStrandCounts(); + PrimitivePair.Int strand_ref = getRefStrandCounts(); + message.append('\t'); + message.append(prefix+"STRAND_COUNTS[C/C/R/R]:"+strand_cons.first+"/"+strand_cons.second+"/"+strand_ref.first+"/"+strand_ref.second); + + message.append('\t'); + message.append(prefix+"OFFSET_RSTART:"+from_start_median+"/"+from_start_mad); + message.append('\t'); + message.append(prefix+"OFFSET_REND:"+from_end_median+"/"+from_end_mad); + + return message.toString(); + + } + + /** + * Places alignment statistics into attribute map and returns the map. If attr parameter is null, + * a new map is allocated, filled and returned. If attr is not null, new attributes are added to that + * preexisting map, and the same instance of the (updated) map is returned. + * + * @param attr + * @return + */ + public Map makeStatsAttributes(Map attr) { + if ( attr == null ) attr = new HashMap(); + + VCFIndelAttributes.recordDepth(getConsensusVariantCount(),getAllVariantCount(),getCoverage(),attr); + + VCFIndelAttributes.recordAvMM(getAvConsensusMismatches(),getAvRefMismatches(),attr); + + VCFIndelAttributes.recordAvMapQ(getAvConsensusMapq(),getAvRefMapq(),attr); + + VCFIndelAttributes.recordNQSMMRate(getNQSConsensusMMRate(),getNQSRefMMRate(),attr); + + VCFIndelAttributes.recordNQSAvQ(getNQSConsensusAvQual(),getNQSRefAvQual(),attr); + + VCFIndelAttributes.recordOffsetFromStart(from_start_median,from_start_mad,attr); + + VCFIndelAttributes.recordOffsetFromEnd(from_end_median,from_end_mad,attr); + + PrimitivePair.Int strand_cons = getConsensusStrandCounts(); + PrimitivePair.Int strand_ref = getRefStrandCounts(); + + VCFIndelAttributes.recordStrandCounts(strand_cons.first,strand_cons.second,strand_ref.first,strand_ref.second,attr); + return attr; + } + } + + interface IndelListener { + public void addObservation(int pos, IndelVariant.Type t, String bases, int fromStart, int fromEnd, ExpandedSAMRecord r); + } + + class WindowContext implements IndelListener { + private Set reads; + private int start=0; // where the window starts on the ref, 1-based + private CircularArray< List< IndelVariant > > indels; + + private List emptyIndelList = new ArrayList(); + + + public WindowContext(int start, int length) { + this.start = start; + indels = new CircularArray< List >(length); +// reads = new LinkedList(); + reads = new HashSet(); + } + + /** Returns 1-based reference start position of the interval this object keeps context for. + * + * @return + */ + public int getStart() { return start; } + + /** Returns 1-based reference stop position (inclusive) of the interval this object keeps context for. + * + * @return + */ + public int getStop() { return start + indels.length() - 1; } + + /** Resets reference start position to 0 and clears the context. + * + */ + public void clear() { + start = 0; + reads.clear(); + indels.clear(); + } + + /** + * Returns true if any indel observations are present in the specified interval + * [begin,end] (1-based, inclusive). Interval can be partially of fully outside of the + * current context window: positions outside of the window will be ignored. + * @param begin + * @param end + */ + public boolean hasIndelsInInterval(long begin, long end) { + for ( long k = Math.max(start,begin); k < Math.min(getStop(),end); k++ ) { + if ( indelsAt(k) != emptyIndelList ) return true; + } + return false; + } + + public Set getReads() { return reads; } + + /** Returns the number of reads spanning over the specified reference position + * (regardless of whether they have a base or indel at that specific location). + * The second argument controls whether to count with indels in mind (this is relevant for insertions only, + * deletions do not require any special treatment since they occupy non-zero length on the ref and since + * alignment can not start or end with a deletion). For insertions, note that, internally, we assign insertions + * to the reference position right after the actual event, and we count all events assigned to a given position. + * This count (reads with indels) should be contrasted to reads without indels, or more rigorously, reads + * that support the ref rather than the indel. Few special cases may occur here: + * 1) an alignment that ends (as per getAlignmentEnd()) right before the current position but has I as its + * last element: we have to count that read into the "coverage" at the current position for the purposes of indel + * assessment, as the indel in that read will be counted at the current position, so the total coverage + * should be consistent with that. + */ + /* NOT IMPLEMENTED: 2) alsignments that start exactly at the current position do not count for the purpose of insertion + * assessment since they do not contribute any evidence to either Ref or Alt=insertion hypothesis, unless + * the alignment starts with I (so that we do have evidence for an indel assigned to the current position and + * read should be counted). For deletions, reads starting at the current position should always be counted (as they + * show no deletion=ref). + * @param refPos position on the reference; must be within the bounds of the window + */ + public int coverageAt(final long refPos, boolean countForIndels) { + int cov = 0; + for ( ExpandedSAMRecord read : reads ) { + if ( read.getSAMRecord().getAlignmentStart() > refPos || read.getSAMRecord().getAlignmentEnd() < refPos ) { + if ( countForIndels && read.getSAMRecord().getAlignmentEnd() == refPos - 1) { + Cigar c = read.getSAMRecord().getCigar(); + if ( c.getCigarElement(c.numCigarElements()-1).getOperator() == CigarOperator.I ) cov++; + } + continue; + } + cov++; + } + return cov; + } + + + /** Shifts current window to the right along the reference contig by the specified number of bases. + * The context will be updated accordingly (indels and reads that go out of scope will be dropped). + * @param offset + */ + public void shift(int offset) { + start += offset; + + indels.shiftData(offset); + if ( indels.get(0) != null && indels.get(0).size() != 0 ) { + IndelVariant indel = indels.get(0).get(0); + + System.out.println("WARNING: Indel(s) at first position in the window ("+refName+":"+start+"): currently not supported: "+ + (indel.getType()==IndelVariant.Type.I?"+":"-")+indel.getBases()+"; read: "+indel.getReadSet().iterator().next().getSAMRecord().getReadName()+"; site ignored"); + indels.get(0).clear(); +// throw new StingException("Indel found at the first position ("+start+") after a shift was performed: currently not supported: "+ +// (indel.getType()==IndelVariant.Type.I?"+":"-")+indel.getBases()+"; reads: "+indel.getReadSet().iterator().next().getSAMRecord().getReadName()); + } + + Iterator read_iter = reads.iterator(); + + while ( read_iter.hasNext() ) { + ExpandedSAMRecord r = read_iter.next(); + if ( r.getSAMRecord().getAlignmentEnd() < start ) { // discard reads and associated data that went out of scope + read_iter.remove(); + } + } + } + + public void add(SAMRecord read, byte [] ref) { + + if ( read.getAlignmentStart() < start ) return; // silently ignore reads starting before the window start + + ExpandedSAMRecord er = new ExpandedSAMRecord(read,ref,read.getAlignmentStart()-start,this); + //TODO duplicate records may actually indicate a problem with input bam file; throw an exception when the bug in CleanedReadInjector is fixed + if ( reads.contains(er)) return; // ignore duplicate records + reads.add(er); + } + + public void addObservation(int pos, IndelVariant.Type type, String bases, int fromStart, int fromEnd, ExpandedSAMRecord rec) { + List indelsAtSite; + try { + indelsAtSite = indels.get(pos); + } catch (IndexOutOfBoundsException e) { + SAMRecord r = rec.getSAMRecord(); + System.out.println("Failed to add indel observation, probably out of coverage window bounds (trailing indel?):\nRead "+ + r.getReadName()+": "+ + "read length="+r.getReadLength()+"; cigar="+r.getCigarString()+"; start="+ + r.getAlignmentStart()+"; end="+r.getAlignmentEnd()+"; window start="+getStart()+ + "; window end="+getStop()); + throw e; + } + + if ( indelsAtSite == null ) { + indelsAtSite = new ArrayList(); + indels.set(pos, indelsAtSite); + } + + IndelVariant indel = null; + for ( IndelVariant v : indelsAtSite ) { + if ( ! v.equals(type, bases) ) continue; + + indel = v; + indel.addObservation(rec); + break; + } + + if ( indel == null ) { // not found: + indel = new IndelVariant(rec, type, bases); + indelsAtSite.add(indel); + } + indel.addReadPositions(fromStart,fromEnd); + } + + public List indelsAt( final long refPos ) { + List l = indels.get((int)( refPos - start )); + if ( l == null ) return emptyIndelList; + else return l; + } + + + } + + + class ExpandedSAMRecord { + private SAMRecord read; + private byte[] mismatch_flags; + private byte[] expanded_quals; + private int mms; + + public ExpandedSAMRecord(SAMRecord r, byte [] ref, long offset, IndelListener l) { + + read = r; + final long rStart = read.getAlignmentStart(); + final long rStop = read.getAlignmentEnd(); + final byte[] readBases = read.getReadString().toUpperCase().getBytes(); + + ref = new String(ref).toUpperCase().getBytes(); + + mismatch_flags = new byte[(int)(rStop-rStart+1)]; + expanded_quals = new byte[(int)(rStop-rStart+1)]; + + // now let's extract indels: + + Cigar c = read.getCigar(); + final int nCigarElems = c.numCigarElements(); + + + int readLength = 0; // length of the aligned part of the read NOT counting clipped bases + for ( CigarElement cel : c.getCigarElements() ) { + + switch(cel.getOperator()) { + case H: + case S: + case D: + case N: + case P: + break; // do not count gaps or clipped bases + case I: + case M: + readLength += cel.getLength(); + break; // advance along the gapless block in the alignment + default : + throw new IllegalArgumentException("Unexpected operator in cigar string: "+cel.getOperator()); + } + } + + int fromStart = 0; + int posOnRead = 0; + int posOnRef = 0; // the chunk of reference ref[] that we have access to is aligned with the read: + // its start on the actual full reference contig is r.getAlignmentStart() + for ( int i = 0 ; i < nCigarElems ; i++ ) { + + final CigarElement ce = c.getCigarElement(i); + IndelVariant.Type type = null; + String indel_bases = null; + int eventPosition = posOnRef; + + switch(ce.getOperator()) { + case H: break; // hard clipped reads do not have clipped indel_bases in their sequence, so we just ignore the H element... + case I: + type = IndelVariant.Type.I; + indel_bases = read.getReadString().substring(posOnRead,posOnRead+ce.getLength()); + // will increment position on the read below, there's no 'break' statement yet... + case S: + // here we also skip soft-clipped indel_bases on the read; according to SAM format specification, + // alignment start position on the reference points to where the actually aligned + // (not clipped) indel_bases go, so we do not need to increment reference position here + posOnRead += ce.getLength(); + break; + case D: + type = IndelVariant.Type.D; + indel_bases = new String( ref, posOnRef, ce.getLength() ); + for( int k = 0 ; k < ce.getLength(); k++, posOnRef++ ) mismatch_flags[posOnRef] = expanded_quals[posOnRef] = -1; + + break; + case M: + for ( int k = 0; k < ce.getLength(); k++, posOnRef++, posOnRead++ ) { + if ( readBases[posOnRead] != ref[posOnRef] ) { // mismatch! + mms++; + mismatch_flags[posOnRef] = 1; + } + expanded_quals[posOnRef] = read.getBaseQualities()[posOnRead]; + } + fromStart += ce.getLength(); + break; // advance along the gapless block in the alignment + default : + throw new IllegalArgumentException("Unexpected operator in cigar string: "+ce.getOperator()); + } + + if ( type == null ) continue; // element was not an indel, go grab next element... + + // we got an indel if we are here... + if ( i == 0 ) logger.debug("Indel at the start of the read "+read.getReadName()); + if ( i == nCigarElems - 1) logger.debug("Indel at the end of the read "+read.getReadName()); + + // note that here we will be assigning indels to the first deleted base or to the first + // base after insertion, not to the last base before the event! + int fromEnd = readLength - fromStart; + if ( type == IndelVariant.Type.I ) fromEnd -= ce.getLength(); + + l.addObservation((int)(offset+eventPosition), type, indel_bases, fromStart, fromEnd, this); + + if ( type == IndelVariant.Type.I ) fromStart += ce.getLength(); + + } + } + + public SAMRecord getSAMRecord() { return read; } + + public byte [] getExpandedMMFlags() { return mismatch_flags; } + + public byte [] getExpandedQuals() { return expanded_quals; } + + public int getMMCount() { return mms; } + + public boolean equals(Object o) { + if ( this == o ) return true; + if ( read == null ) return false; + if ( o instanceof SAMRecord ) return read.equals(o); + if ( o instanceof ExpandedSAMRecord ) return read.equals(((ExpandedSAMRecord)o).read); + return false; + } + + + } + +} + + +class VCFIndelAttributes { + public static String ALLELIC_DEPTH_KEY = "AD"; + public static String DEPTH_TOTAL_KEY = VCFConstants.DEPTH_KEY; + + public static String MAPQ_KEY = "MQS"; + + public static String MM_KEY = "MM"; + + public static String NQS_MMRATE_KEY = "NQSMM"; + + public static String NQS_AVQ_KEY = "NQSBQ"; + + public static String STRAND_COUNT_KEY = "SC"; + public static String RSTART_OFFSET_KEY = "RStart"; + public static String REND_OFFSET_KEY = "REnd"; + + public static Set getAttributeHeaderLines() { + Set lines = new HashSet(); + + lines.add(new VCFFormatHeaderLine(ALLELIC_DEPTH_KEY, 2, VCFHeaderLineType.Integer, "# of reads supporting consensus indel/reference at the site")); + lines.add(new VCFFormatHeaderLine(DEPTH_TOTAL_KEY, 1, VCFHeaderLineType.Integer, "Total coverage at the site")); + + lines.add(new VCFFormatHeaderLine(MAPQ_KEY, 2, VCFHeaderLineType.Float, "Average mapping qualities of consensus indel-supporting reads/reference-supporting reads")); + + lines.add(new VCFFormatHeaderLine(MM_KEY, 2, VCFHeaderLineType.Float, "Average # of mismatches per consensus indel-supporting read/per reference-supporting read")); + + lines.add(new VCFFormatHeaderLine(NQS_MMRATE_KEY, 2, VCFHeaderLineType.Float, "Within NQS window: fraction of mismatching bases in consensus indel-supporting reads/in reference-supporting reads")); + + lines.add(new VCFFormatHeaderLine(NQS_AVQ_KEY, 2, VCFHeaderLineType.Float, "Within NQS window: average quality of bases from consensus indel-supporting reads/from reference-supporting reads")); + + lines.add(new VCFFormatHeaderLine(STRAND_COUNT_KEY, 4, VCFHeaderLineType.Integer, "Strandness: counts of forward-/reverse-aligned indel-supporting reads / forward-/reverse-aligned reference supporting reads")); + + lines.add(new VCFFormatHeaderLine(RSTART_OFFSET_KEY, 2, VCFHeaderLineType.Integer, "Median/mad of indel offsets from the starts of the reads")); + lines.add(new VCFFormatHeaderLine(REND_OFFSET_KEY, 2, VCFHeaderLineType.Integer, "Median/mad of indel offsets from the ends of the reads")); + + return lines; + } + + public static Map recordStrandCounts(int cnt_cons_fwd, int cnt_cons_rev, int cnt_ref_fwd, int cnt_ref_rev, Map attrs) { + attrs.put(STRAND_COUNT_KEY, new Integer[] {cnt_cons_fwd, cnt_cons_rev, cnt_ref_fwd, cnt_ref_rev} ); + return attrs; + } + + public static Map recordDepth(int cnt_cons, int cnt_indel, int cnt_total, Map attrs) { + attrs.put(ALLELIC_DEPTH_KEY, new Integer[] {cnt_cons, cnt_indel} ); + attrs.put(DEPTH_TOTAL_KEY, cnt_total); + return attrs; + } + + public static Map recordAvMapQ(double cons, double ref, Map attrs) { + attrs.put(MAPQ_KEY, new Float[] {(float)cons, (float)ref} ); + return attrs; + } + + public static Map recordAvMM(double cons, double ref, Map attrs) { + attrs.put(MM_KEY, new Float[] {(float)cons, (float)ref} ); + return attrs; + } + + public static Map recordNQSMMRate(double cons, double ref, Map attrs) { + attrs.put(NQS_MMRATE_KEY, new Float[] {(float)cons, (float)ref} ); + return attrs; + } + + public static Map recordNQSAvQ(double cons, double ref, Map attrs) { + attrs.put(NQS_AVQ_KEY, new Float[] {(float)cons, (float)ref} ); + return attrs; + } + + public static Map recordOffsetFromStart(int median, int mad, Map attrs) { + attrs.put(RSTART_OFFSET_KEY, new Integer[] {median, mad} ); + return attrs; + } + + public static Map recordOffsetFromEnd(int median, int mad, Map attrs) { + attrs.put(REND_OFFSET_KEY, new Integer[] {median, mad} ); + return attrs; + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/MergeAndMatchHaplotypes.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/MergeAndMatchHaplotypes.java index 315ae36d8..489e8609a 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/MergeAndMatchHaplotypes.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/MergeAndMatchHaplotypes.java @@ -91,7 +91,7 @@ public class MergeAndMatchHaplotypes extends RodWalker { } VariantContext newvc = new VariantContext(SOURCE_NAME, pbt.getChr(), pbt.getStart(), pbt.getStart(), pbt.getAlleles(), genotypes, pbt.getNegLog10PError(), pbt.getFilters(), pbt.getAttributes()); - vcfWriter.add(newvc, ref.getBase()); + vcfWriter.add(newvc); } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/MergeSegregatingAlternateAllelesVCFWriter.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/MergeSegregatingAlternateAllelesVCFWriter.java index b0491a281..53cfaa3a9 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/MergeSegregatingAlternateAllelesVCFWriter.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/MergeSegregatingAlternateAllelesVCFWriter.java @@ -118,7 +118,7 @@ public class MergeSegregatingAlternateAllelesVCFWriter implements VCFWriter { innerWriter.close(); } - public void add(VariantContext vc, byte refBase) { + public void add(VariantContext vc) { if (useSingleSample != null) { // only want to output context for one sample Genotype sampGt = vc.getGenotype(useSingleSample); if (sampGt != null) // TODO: subContextFromGenotypes() does not handle any INFO fields [AB, HaplotypeScore, MQ, etc.]. Note that even SelectVariants.subsetRecord() only handles AC,AN,AF, and DP! @@ -138,11 +138,11 @@ public class MergeSegregatingAlternateAllelesVCFWriter implements VCFWriter { if (curVcIsNotFiltered) { // still need to wait before can release vc logger.debug("Waiting for new variant " + VariantContextUtils.getLocation(genomeLocParser, vc)); - vcfrWaitingToMerge = new VCFRecord(vc, refBase, false); + vcfrWaitingToMerge = new VCFRecord(vc, false); } else if (!emitOnlyMergedRecords) { // filtered records are never merged logger.debug("DIRECTLY output " + VariantContextUtils.getLocation(genomeLocParser, vc)); - innerWriter.add(vc, refBase); + innerWriter.add(vc); } } else { // waiting to merge vcfrWaitingToMerge @@ -151,7 +151,7 @@ public class MergeSegregatingAlternateAllelesVCFWriter implements VCFWriter { if (!curVcIsNotFiltered) { if (!emitOnlyMergedRecords) { // filtered records are never merged logger.debug("Caching unprocessed output " + VariantContextUtils.getLocation(genomeLocParser, vc)); - filteredVcfrList.add(new VCFRecord(vc, refBase, false)); + filteredVcfrList.add(new VCFRecord(vc, false)); } } else { // waiting to merge vcfrWaitingToMerge, and curVcIsNotFiltered. So, attempt to merge them: @@ -188,14 +188,14 @@ public class MergeSegregatingAlternateAllelesVCFWriter implements VCFWriter { addedAttribs.putAll(mergedVc.getAttributes()); mergedVc = VariantContext.modifyAttributes(mergedVc, addedAttribs); - vcfrWaitingToMerge = new VCFRecord(mergedVc, vcfrWaitingToMerge.refBase, true); + vcfrWaitingToMerge = new VCFRecord(mergedVc, true); numMergedRecords++; } } if (!mergedRecords) { stopWaitingToMerge(); - vcfrWaitingToMerge = new VCFRecord(vc, refBase, false); + vcfrWaitingToMerge = new VCFRecord(vc, false); } logger.debug("Merged? = " + mergedRecords); } @@ -210,11 +210,11 @@ public class MergeSegregatingAlternateAllelesVCFWriter implements VCFWriter { } if (!emitOnlyMergedRecords || vcfrWaitingToMerge.resultedFromMerge) - innerWriter.add(vcfrWaitingToMerge.vc, vcfrWaitingToMerge.refBase); + innerWriter.add(vcfrWaitingToMerge.vc); vcfrWaitingToMerge = null; for (VCFRecord vcfr : filteredVcfrList) - innerWriter.add(vcfr.vc, vcfr.refBase); + innerWriter.add(vcfr.vc); filteredVcfrList.clear(); } @@ -257,12 +257,10 @@ public class MergeSegregatingAlternateAllelesVCFWriter implements VCFWriter { private static class VCFRecord { public VariantContext vc; - public byte refBase; public boolean resultedFromMerge; - public VCFRecord(VariantContext vc, byte refBase, boolean resultedFromMerge) { + public VCFRecord(VariantContext vc, boolean resultedFromMerge) { this.vc = vc; - this.refBase = refBase; this.resultedFromMerge = resultedFromMerge; } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java index adad4b20d..fab4f2253 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java @@ -311,7 +311,8 @@ public class PhaseByTransmission extends RodWalker { VariantContext newvc = VariantContext.modifyGenotypes(vc, genotypeMap); - vcfWriter.add(newvc, ref.getBase()); + vcfWriter.add(newvc); + } } return null; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/WriteVCF.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/WriteVCF.java index 2851ace0d..c10eaa2da 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/WriteVCF.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/WriteVCF.java @@ -25,20 +25,10 @@ package org.broadinstitute.sting.gatk.walkers.phasing; import org.apache.log4j.Logger; import org.broadinstitute.sting.utils.codecs.vcf.VCFWriter; -import org.broadinstitute.sting.utils.variantcontext.Allele; import org.broadinstitute.sting.utils.variantcontext.VariantContext; public class WriteVCF { public static void writeVCF(VariantContext vc, VCFWriter writer, Logger logger) { - byte refBase; - if (!vc.isIndel()) { - Allele refAllele = vc.getReference(); - refBase = SNPallelePair.getSingleBase(refAllele); - } - else { - refBase = vc.getReferenceBaseForIndel(); - } - - writer.add(vc, refBase); + writer.add(vc); } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/ApplyRecalibration.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/ApplyRecalibration.java index 1415db87c..74b7b8e7d 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/ApplyRecalibration.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/ApplyRecalibration.java @@ -204,9 +204,9 @@ public class ApplyRecalibration extends RodWalker { filters.add(filterString); vc = VariantContext.modifyFilters(vc, filters); } - vcfWriter.add( VariantContext.modifyPErrorFiltersAndAttributes(vc, vc.getNegLog10PError(), vc.getFilters(), attrs), ref.getBase() ); + vcfWriter.add( VariantContext.modifyPErrorFiltersAndAttributes(vc, vc.getNegLog10PError(), vc.getFilters(), attrs) ); } else { // valid VC but not compatible with this mode, so just emit the variant untouched - vcfWriter.add( vc, ref.getBase() ); + vcfWriter.add( vc ); } } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java index e7f74de0d..28231ab1b 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java @@ -258,7 +258,7 @@ public class VariantDataManager { datum.consensusCount = 0; for( final TrainingSet trainingSet : trainingSets ) { - for( final VariantContext trainVC : tracker.getValues(VariantContext.class, trainingSet.name) ) { + for( final VariantContext trainVC : tracker.getValues(VariantContext.class, trainingSet.name, ref.getLocus()) ) { if( trainVC != null && trainVC.isNotFiltered() && trainVC.isVariant() && ((evalVC.isSNP() && trainVC.isSNP()) || ((evalVC.isIndel()||evalVC.isMixed()) && (trainVC.isIndel()||trainVC.isMixed()))) && (TRUST_ALL_POLYMORPHIC || !trainVC.hasGenotypes() || trainVC.isPolymorphic()) ) { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java index 6d1d75c03..e918d5ce8 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java @@ -156,7 +156,7 @@ public class CombineVariants extends RodWalker { if ( ASSUME_IDENTICAL_SAMPLES ) { for ( final VariantContext vc : vcs ) { - vcfWriter.add( vc, ref.getBase() ); + vcfWriter.add(vc); } return vcs.isEmpty() ? 0 : 1; @@ -181,7 +181,7 @@ public class CombineVariants extends RodWalker { if ( VCsByType.containsKey(type) ) mergedVCs.add(VariantContextUtils.simpleMerge(getToolkit().getGenomeLocParser(), VCsByType.get(type), priority, filteredRecordsMergeType, genotypeMergeOption, true, printComplexMerges, - ref.getBase(), SET_KEY, filteredAreUncalled, MERGE_INFO_WITH_MAX_AC)); + SET_KEY, filteredAreUncalled, MERGE_INFO_WITH_MAX_AC)); } } @@ -196,7 +196,7 @@ public class CombineVariants extends RodWalker { VariantContext annotatedMergedVC = VariantContext.modifyAttributes(mergedVC, attributes); if ( minimalVCF ) annotatedMergedVC = VariantContextUtils.pruneVariantContext(annotatedMergedVC, Arrays.asList(SET_KEY)); - vcfWriter.add(annotatedMergedVC, ref.getBase()); + vcfWriter.add(annotatedMergedVC); } return vcs.isEmpty() ? 0 : 1; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/FilterLiftedVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/FilterLiftedVariants.java index 2f6c7d99c..751633f9d 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/FilterLiftedVariants.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/FilterLiftedVariants.java @@ -82,7 +82,7 @@ public class FilterLiftedVariants extends RodWalker { if ( failed ) failedLocs++; else - writer.add(vc, ref[0]); + writer.add(vc); } public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/LeftAlignVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/LeftAlignVariants.java index 5b83ae688..d8bcb252d 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/LeftAlignVariants.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/LeftAlignVariants.java @@ -94,10 +94,10 @@ public class LeftAlignVariants extends RodWalker { private int alignAndWrite(VariantContext vc, final ReferenceContext ref) { - if ( vc.isBiallelic() && vc.isIndel() ) + if ( vc.isBiallelic() && vc.isIndel() && !vc.isComplexIndel() ) return writeLeftAlignedIndel(vc, ref); else { - writer.add(vc, ref.getBase()); + writer.add(vc); return 0; } } @@ -113,7 +113,7 @@ public class LeftAlignVariants extends RodWalker { indelLength = vc.getAlternateAllele(0).length(); if ( indelLength > 200 ) { - writer.add(vc, ref.getBase()); + writer.add(vc); return 0; } @@ -141,17 +141,12 @@ public class LeftAlignVariants extends RodWalker { byte[] newBases = new byte[indelLength]; System.arraycopy((vc.isDeletion() ? refSeq : originalIndel), indelIndex, newBases, 0, indelLength); Allele newAllele = Allele.create(newBases, vc.isDeletion()); - newVC = updateAllele(newVC, newAllele); + newVC = updateAllele(newVC, newAllele, refSeq[indelIndex-1]); - // we need to update the reference base just in case it changed - Map attrs = new HashMap(newVC.getAttributes()); - attrs.put(VariantContext.REFERENCE_BASE_FOR_INDEL_KEY, refSeq[indelIndex-1]); - newVC = VariantContext.modifyAttributes(newVC, attrs); - - writer.add(newVC, refSeq[indelIndex-1]); + writer.add(newVC); return 1; } else { - writer.add(vc, ref.getBase()); + writer.add(vc); return 0; } } @@ -177,7 +172,7 @@ public class LeftAlignVariants extends RodWalker { return hap; } - public static VariantContext updateAllele(VariantContext vc, Allele newAllele) { + public static VariantContext updateAllele(VariantContext vc, Allele newAllele, Byte refBaseForIndel) { // create a mapping from original allele to new allele HashMap alleleMap = new HashMap(vc.getAlleles().size()); if ( newAllele.isReference() ) { @@ -201,6 +196,6 @@ public class LeftAlignVariants extends RodWalker { newGenotypes.put(genotype.getKey(), Genotype.modifyAlleles(genotype.getValue(), newAlleles)); } - return new VariantContext(vc.getSource(), vc.getChr(), vc.getStart(), vc.getEnd(), alleleMap.values(), newGenotypes, vc.getNegLog10PError(), vc.filtersWereApplied() ? vc.getFilters() : null, vc.getAttributes()); + return new VariantContext(vc.getSource(), vc.getChr(), vc.getStart(), vc.getEnd(), alleleMap.values(), newGenotypes, vc.getNegLog10PError(), vc.filtersWereApplied() ? vc.getFilters() : null, vc.getAttributes(), refBaseForIndel); } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/LiftoverVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/LiftoverVariants.java index eeb85d02d..f3f0085f9 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/LiftoverVariants.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/LiftoverVariants.java @@ -128,14 +128,14 @@ public class LiftoverVariants extends RodWalker { vc = VariantContext.modifyAttributes(vc, attrs); } - VariantContext newVC = VariantContext.createVariantContextWithPaddedAlleles(vc, ref.getBase(), false); + VariantContext newVC = VariantContext.createVariantContextWithPaddedAlleles(vc, false); if ( originalVC.isSNP() && originalVC.isBiallelic() && VariantContextUtils.getSNPSubstitutionType(originalVC) != VariantContextUtils.getSNPSubstitutionType(newVC) ) { logger.warn(String.format("VCF at %s / %d => %s / %d is switching substitution type %s/%s to %s/%s", originalVC.getChr(), originalVC.getStart(), newVC.getChr(), newVC.getStart(), originalVC.getReference(), originalVC.getAlternateAllele(0), newVC.getReference(), newVC.getAlternateAllele(0))); } - writer.add(vc, ref.getBase()); + writer.add(vc); successfulIntervals++; } else { failedIntervals++; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/RandomlySplitVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/RandomlySplitVariants.java index ddbb1ed56..5f5c9547b 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/RandomlySplitVariants.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/RandomlySplitVariants.java @@ -98,9 +98,9 @@ public class RandomlySplitVariants extends RodWalker { for ( VariantContext vc : vcs ) { int random = GenomeAnalysisEngine.getRandomGenerator().nextInt(1000); if ( random < iFraction ) - vcfWriter1.add(vc, ref.getBase()); + vcfWriter1.add(vc); else - vcfWriter2.add(vc, ref.getBase()); + vcfWriter2.add(vc); } return 1; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java index 893edec4e..063b005a6 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java @@ -32,6 +32,8 @@ import org.broadinstitute.sting.utils.text.XReadLines; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.utils.MendelianViolation; import org.broadinstitute.sting.utils.variantcontext.VariantContext; +import org.broadinstitute.sting.commandline.Argument; +import org.broadinstitute.sting.commandline.Output; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; @@ -126,16 +128,13 @@ public class SelectVariants extends RodWalker { /* Private class used to store the intermediate variants in the integer random selection process */ private class RandomVariantStructure { private VariantContext vc; - private byte refBase; - RandomVariantStructure(VariantContext vcP, byte refBaseP) { + RandomVariantStructure(VariantContext vcP) { vc = vcP; - refBase = refBaseP; } - public void set (VariantContext vcP, byte refBaseP) { + public void set (VariantContext vcP) { vc = vcP; - refBase = refBaseP; } } @@ -356,7 +355,7 @@ public class SelectVariants extends RodWalker { randomlyAddVariant(++variantNumber, sub, ref.getBase()); } else if (!SELECT_RANDOM_FRACTION || (!KEEP_AF_SPECTRUM && GenomeAnalysisEngine.getRandomGenerator().nextDouble() < fractionRandom)) { - vcfWriter.add(sub, ref.getBase()); + vcfWriter.add(sub); } else { if (SELECT_RANDOM_FRACTION && KEEP_AF_SPECTRUM ) { @@ -404,7 +403,7 @@ public class SelectVariants extends RodWalker { //System.out.format("%s .. %4.4f\n",afo.toString(), af); if (GenomeAnalysisEngine.getRandomGenerator().nextDouble() < fractionRandom * afBoost * afBoost) - vcfWriter.add(sub, ref.getBase()); + vcfWriter.add(sub); } @@ -511,7 +510,7 @@ public class SelectVariants extends RodWalker { if (SELECT_RANDOM_NUMBER) { int positionToPrint = positionToAdd; for (int i=0; i { private void randomlyAddVariant(int rank, VariantContext vc, byte refBase) { if (nVariantsAdded < numRandom) - variantArray[nVariantsAdded++] = new RandomVariantStructure(vc, refBase); + variantArray[nVariantsAdded++] = new RandomVariantStructure(vc); else { double v = GenomeAnalysisEngine.getRandomGenerator().nextDouble(); double t = (1.0/(rank-numRandom+1)); if ( v < t) { - variantArray[positionToAdd].set(vc, refBase); + variantArray[positionToAdd].set(vc); nVariantsAdded++; positionToAdd = nextCircularPosition(positionToAdd); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantValidationAssessor.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantValidationAssessor.java index cb8be9c17..c6f965423 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantValidationAssessor.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantValidationAssessor.java @@ -70,7 +70,7 @@ public class VariantValidationAssessor extends RodWalker sampleNames = null; // variant context records - private ArrayList> records = new ArrayList>(); + private ArrayList records = new ArrayList(); // statistics private int numRecords = 0; @@ -91,7 +91,7 @@ public class VariantValidationAssessor extends RodWalker map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + public VariantContext map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { if ( tracker == null ) return null; @@ -106,7 +106,7 @@ public class VariantValidationAssessor extends RodWalker call, Integer numVariants) { + public Integer reduce(VariantContext call, Integer numVariants) { if ( call != null ) { numVariants++; records.add(call); @@ -156,12 +156,12 @@ public class VariantValidationAssessor extends RodWalker record : records ) - vcfwriter.add(record.first, record.second); + for ( VariantContext record : records ) + vcfwriter.add(record); } - private Pair addVariantInformationToCall(ReferenceContext ref, VariantContext vContext) { + private VariantContext addVariantInformationToCall(ReferenceContext ref, VariantContext vContext) { // check possible filters double hwPvalue = hardyWeinbergCalculation(vContext); @@ -203,9 +203,7 @@ public class VariantValidationAssessor extends RodWalker(vContext, ref.getBase()); + return VariantContext.modifyAttributes(vContext, infoMap); } private double hardyWeinbergCalculation(VariantContext vc) { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTable.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTable.java index 3bf615c94..8c1c33f1b 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTable.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTable.java @@ -83,8 +83,8 @@ public class VariantsToTable extends RodWalker { getters.put("REF", new Getter() { public String get(VariantContext vc) { String x = ""; - if (vc.hasAttribute(VariantContext.REFERENCE_BASE_FOR_INDEL_KEY)) { - Byte refByte = (Byte)(vc.getAttribute(VariantContext.REFERENCE_BASE_FOR_INDEL_KEY)); + if ( vc.hasReferenceBaseForIndel() ) { + Byte refByte = vc.getReferenceBaseForIndel(); x=x+new String(new byte[]{refByte}); } return x+vc.getReference().getDisplayString(); @@ -95,8 +95,8 @@ public class VariantsToTable extends RodWalker { StringBuilder x = new StringBuilder(); int n = vc.getAlternateAlleles().size(); if ( n == 0 ) return "."; - if (vc.hasAttribute(VariantContext.REFERENCE_BASE_FOR_INDEL_KEY)) { - Byte refByte = (Byte)(vc.getAttribute(VariantContext.REFERENCE_BASE_FOR_INDEL_KEY)); + if ( vc.hasReferenceBaseForIndel() ) { + Byte refByte = vc.getReferenceBaseForIndel(); x.append(new String(new byte[]{refByte})); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToVCF.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToVCF.java index 59cf365c4..232ae91a8 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToVCF.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToVCF.java @@ -154,9 +154,10 @@ public class VariantsToVCF extends RodWalker { VariantContext vc = VariantContextAdaptors.toVariantContext(variants.getName(), hapmap, ref); if ( vc != null ) { if ( refBase != null ) { - Map attrs = new HashMap(vc.getAttributes()); - attrs.put(VariantContext.REFERENCE_BASE_FOR_INDEL_KEY, refBase); - vc = VariantContext.modifyAttributes(vc, attrs); + // TODO -- fix me + //Map attrs = new HashMap(vc.getAttributes()); + //attrs.put(VariantContext.REFERENCE_BASE_FOR_INDEL_KEY, refBase); + //vc = VariantContext.modifyAttributes(vc, attrs); } hapmapVCs.add(vc); } @@ -238,7 +239,7 @@ public class VariantsToVCF extends RodWalker { } vc = VariantContextUtils.purgeUnallowedGenotypeAttributes(vc, allowedGenotypeFormatStrings); - vcfwriter.add(vc, ref); + vcfwriter.add(vc); } public Integer reduceInit() { diff --git a/public/java/src/org/broadinstitute/sting/utils/Utils.java b/public/java/src/org/broadinstitute/sting/utils/Utils.java index 6a50badce..015e5d6f6 100755 --- a/public/java/src/org/broadinstitute/sting/utils/Utils.java +++ b/public/java/src/org/broadinstitute/sting/utils/Utils.java @@ -42,6 +42,21 @@ public class Utils { /** our log, which we want to capture anything from this class */ private static Logger logger = Logger.getLogger(Utils.class); + public static final float JAVA_DEFAULT_HASH_LOAD_FACTOR = 0.75f; + + /** + * Calculates the optimum initial size for a hash table given the maximum number + * of elements it will need to hold. The optimum size is the smallest size that + * is guaranteed not to result in any rehash/table-resize operations. + * + * @param maxElements The maximum number of elements you expect the hash table + * will need to hold + * @return The optimum initial size for the table, given maxElements + */ + public static int optimumHashSize ( int maxElements ) { + return (int)(maxElements / JAVA_DEFAULT_HASH_LOAD_FACTOR) + 2; + } + public static String getClassName(Class c) { String FQClassName = c.getName(); int firstChar; diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/AbstractVCFCodec.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/AbstractVCFCodec.java index 710127f7a..9788f8654 100755 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/AbstractVCFCodec.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/AbstractVCFCodec.java @@ -567,7 +567,6 @@ public abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec, // set the reference base for indels in the attributes Map attributes = new TreeMap(inputVC.getAttributes()); - attributes.put(VariantContext.REFERENCE_BASE_FOR_INDEL_KEY, new Byte(inputVC.getReference().getBases()[0])); Map originalToTrimmedAlleleMap = new HashMap(); @@ -611,7 +610,7 @@ public abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec, genotypes.put(sample.getKey(), Genotype.modifyAlleles(sample.getValue(), trimmedAlleles)); } - return new VariantContext(inputVC.getSource(), inputVC.getChr(), inputVC.getStart(), inputVC.getEnd(), alleles, genotypes, inputVC.getNegLog10PError(), inputVC.filtersWereApplied() ? inputVC.getFilters() : null, attributes); + return new VariantContext(inputVC.getSource(), inputVC.getChr(), inputVC.getStart(), inputVC.getEnd(), alleles, genotypes, inputVC.getNegLog10PError(), inputVC.filtersWereApplied() ? inputVC.getFilters() : null, attributes, new Byte(inputVC.getReference().getBases()[0])); } diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/SortingVCFWriterBase.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/SortingVCFWriterBase.java index 311aaecf7..c299511db 100755 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/SortingVCFWriterBase.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/SortingVCFWriterBase.java @@ -105,9 +105,8 @@ public abstract class SortingVCFWriterBase implements VCFWriter { * add a record to the file * * @param vc the Variant Context object - * @param refBase the ref base */ - public void add(VariantContext vc, byte refBase) { + public void add(VariantContext vc) { /* Note that the code below does not prevent the successive add()-ing of: (chr1, 10), (chr20, 200), (chr15, 100) since there is no implicit ordering of chromosomes: */ @@ -122,7 +121,7 @@ public abstract class SortingVCFWriterBase implements VCFWriter { noteCurrentRecord(vc); // possibly overwritten - queue.add(new VCFRecord(vc, refBase)); + queue.add(new VCFRecord(vc)); emitSafeRecords(); } @@ -133,7 +132,7 @@ public abstract class SortingVCFWriterBase implements VCFWriter { // No need to wait, waiting for nothing, or before what we're waiting for: if (emitUnsafe || mostUpstreamWritableLoc == null || firstRec.vc.getStart() <= mostUpstreamWritableLoc) { queue.poll(); - innerWriter.add(firstRec.vc, firstRec.refBase); + innerWriter.add(firstRec.vc); } else { break; @@ -143,7 +142,7 @@ public abstract class SortingVCFWriterBase implements VCFWriter { /** * Gets a string representation of this object. - * @return + * @return a string representation of this object */ @Override public String toString() { @@ -158,11 +157,9 @@ public abstract class SortingVCFWriterBase implements VCFWriter { private static class VCFRecord { public VariantContext vc; - public byte refBase; - public VCFRecord(VariantContext vc, byte refBase) { + public VCFRecord(VariantContext vc) { this.vc = vc; - this.refBase = refBase; } } } \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/StandardVCFWriter.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/StandardVCFWriter.java index b7f4be39a..d3705813c 100755 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/StandardVCFWriter.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/StandardVCFWriter.java @@ -202,20 +202,18 @@ public class StandardVCFWriter implements VCFWriter { * add a record to the file * * @param vc the Variant Context object - * @param refBase the ref base used for indels */ - public void add(VariantContext vc, byte refBase) { - add(vc, refBase, false); + public void add(VariantContext vc) { + add(vc, false); } /** * add a record to the file * * @param vc the Variant Context object - * @param refBase the ref base used for indels * @param refBaseShouldBeAppliedToEndOfAlleles *** THIS SHOULD BE FALSE EXCEPT FOR AN INDEL AT THE EXTREME BEGINNING OF A CONTIG (WHERE THERE IS NO PREVIOUS BASE, SO WE USE THE BASE AFTER THE EVENT INSTEAD) */ - public void add(VariantContext vc, byte refBase, boolean refBaseShouldBeAppliedToEndOfAlleles) { + public void add(VariantContext vc, boolean refBaseShouldBeAppliedToEndOfAlleles) { if ( mHeader == null ) throw new IllegalStateException("The VCF Header must be written before records can be added: " + locationString()); @@ -223,7 +221,7 @@ public class StandardVCFWriter implements VCFWriter { vc = VariantContext.modifyGenotypes(vc, null); try { - vc = VariantContext.createVariantContextWithPaddedAlleles(vc, refBase, refBaseShouldBeAppliedToEndOfAlleles); + vc = VariantContext.createVariantContextWithPaddedAlleles(vc, refBaseShouldBeAppliedToEndOfAlleles); // if we are doing on the fly indexing, add the record ***before*** we write any bytes if ( indexer != null ) indexer.addFeature(vc, positionalStream.getPosition()); @@ -285,7 +283,7 @@ public class StandardVCFWriter implements VCFWriter { Map infoFields = new TreeMap(); for ( Map.Entry field : vc.getAttributes().entrySet() ) { String key = field.getKey(); - if ( key.equals(VariantContext.ID_KEY) || key.equals(VariantContext.REFERENCE_BASE_FOR_INDEL_KEY) || key.equals(VariantContext.UNPARSED_GENOTYPE_MAP_KEY) || key.equals(VariantContext.UNPARSED_GENOTYPE_PARSER_KEY) ) + if ( key.equals(VariantContext.ID_KEY) || key.equals(VariantContext.UNPARSED_GENOTYPE_MAP_KEY) || key.equals(VariantContext.UNPARSED_GENOTYPE_PARSER_KEY) ) continue; String outputValue = formatVCFField(field.getValue()); diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFWriter.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFWriter.java index 0d23fe455..55749d26e 100755 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFWriter.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFWriter.java @@ -14,5 +14,5 @@ public interface VCFWriter { */ public void close(); - public void add(VariantContext vc, byte refBase); + public void add(VariantContext vc); } \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/utils/text/TextFormattingUtils.java b/public/java/src/org/broadinstitute/sting/utils/text/TextFormattingUtils.java index 1d4251542..3159f3fb7 100644 --- a/public/java/src/org/broadinstitute/sting/utils/text/TextFormattingUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/text/TextFormattingUtils.java @@ -116,4 +116,57 @@ public class TextFormattingUtils { return bundle; } + + /** + * Returns the word starting positions within line, excluding the first position 0. + * The returned list is compatible with splitFixedWidth. + * @param line Text to parse. + * @return the word starting positions within line, excluding the first position 0. + */ + public static List getWordStarts(String line) { + if (line == null) + throw new ReviewedStingException("line is null"); + List starts = new ArrayList(); + int stop = line.length(); + for (int i = 1; i < stop; i++) + if (Character.isWhitespace(line.charAt(i-1))) + if(!Character.isWhitespace(line.charAt(i))) + starts.add(i); + return starts; + } + + /** + * Parses a fixed width line of text. + * @param line Text to parse. + * @param columnStarts the column starting positions within line, excluding the first position 0. + * @return The parsed string array with each entry trimmed. + */ + public static String[] splitFixedWidth(String line, List columnStarts) { + if (line == null) + throw new ReviewedStingException("line is null"); + if (columnStarts == null) + throw new ReviewedStingException("columnStarts is null"); + int startCount = columnStarts.size(); + String[] row = new String[startCount + 1]; + if (startCount == 0) { + row[0] = line.trim(); + } else { + row[0] = line.substring(0, columnStarts.get(0)).trim(); + for (int i = 1; i < startCount; i++) + row[i] = line.substring(columnStarts.get(i - 1), columnStarts.get(i)).trim(); + row[startCount] = line.substring(columnStarts.get(startCount - 1)).trim(); + } + return row; + } + + /** + * Parses a line of text by whitespace. + * @param line Text to parse. + * @return The parsed string array. + */ + public static String[] splitWhiteSpace(String line) { + if (line == null) + throw new ReviewedStingException("line is null"); + return line.trim().split("\\s+"); + } } diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/MutableVariantContext.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/MutableVariantContext.java index a191670a4..a752f4a1b 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/MutableVariantContext.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/MutableVariantContext.java @@ -27,15 +27,15 @@ public class MutableVariantContext extends VariantContext { } public MutableVariantContext(String source, String contig, long start, long stop, Collection alleles) { - this(source, contig, start, stop, alleles, NO_GENOTYPES, InferredGeneticContext.NO_NEG_LOG_10PERROR, null, null); + super(source, contig, start, stop, alleles, NO_GENOTYPES, InferredGeneticContext.NO_NEG_LOG_10PERROR, null, null); } public MutableVariantContext(String source, String contig, long start, long stop, Collection alleles, Collection genotypes) { - this(source, contig, start, stop, alleles, genotypes, InferredGeneticContext.NO_NEG_LOG_10PERROR, null, null); + super(source, contig, start, stop, alleles, genotypes, InferredGeneticContext.NO_NEG_LOG_10PERROR, null, null); } public MutableVariantContext(VariantContext parent) { - this(parent.getSource(), parent.contig, parent.start, parent.stop, parent.getAlleles(), parent.getGenotypes(), parent.getNegLog10PError(), parent.getFilters(), parent.getAttributes()); + super(parent.getSource(), parent.contig, parent.start, parent.stop, parent.getAlleles(), parent.getGenotypes(), parent.getNegLog10PError(), parent.getFilters(), parent.getAttributes(), parent.getReferenceBaseForIndel()); } /** diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java index eab392c4d..1712f6f7b 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java @@ -5,6 +5,7 @@ import org.broad.tribble.TribbleException; import org.broad.tribble.util.ParsingUtils; import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; import org.broadinstitute.sting.utils.codecs.vcf.VCFParser; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import java.util.*; @@ -163,11 +164,12 @@ import java.util.*; public class VariantContext implements Feature { // to enable tribble intergration protected InferredGeneticContext commonInfo = null; public final static double NO_NEG_LOG_10PERROR = InferredGeneticContext.NO_NEG_LOG_10PERROR; - public final static String REFERENCE_BASE_FOR_INDEL_KEY = "_REFERENCE_BASE_FOR_INDEL_"; public final static String UNPARSED_GENOTYPE_MAP_KEY = "_UNPARSED_GENOTYPE_MAP_"; public final static String UNPARSED_GENOTYPE_PARSER_KEY = "_UNPARSED_GENOTYPE_PARSER_"; public final static String ID_KEY = "ID"; + private final Byte REFERENCE_BASE_FOR_INDEL; + public final static Set PASSES_FILTERS = Collections.unmodifiableSet(new LinkedHashSet()); /** The location of this VariantContext */ @@ -205,6 +207,24 @@ public class VariantContext implements Feature { // to enable tribble intergrati // --------------------------------------------------------------------------------------------------------- + /** + * the complete constructor. Makes a complete VariantContext from its arguments + * + * @param source source + * @param contig the contig + * @param start the start base (one based) + * @param stop the stop reference base (one based) + * @param alleles alleles + * @param genotypes genotypes map + * @param negLog10PError qual + * @param filters filters: use null for unfiltered and empty set for passes filters + * @param attributes attributes + * @param referenceBaseForIndel padded reference base + */ + public VariantContext(String source, String contig, long start, long stop, Collection alleles, Map genotypes, double negLog10PError, Set filters, Map attributes, Byte referenceBaseForIndel) { + this(source, contig, start, stop, alleles, genotypes, negLog10PError, filters, attributes, referenceBaseForIndel, false); + } + /** * the complete constructor. Makes a complete VariantContext from its arguments * @@ -219,7 +239,7 @@ public class VariantContext implements Feature { // to enable tribble intergrati * @param attributes attributes */ public VariantContext(String source, String contig, long start, long stop, Collection alleles, Map genotypes, double negLog10PError, Set filters, Map attributes) { - this(source, contig, start, stop, alleles, genotypes, negLog10PError, filters, attributes, false); + this(source, contig, start, stop, alleles, genotypes, negLog10PError, filters, attributes, null, false); } /** @@ -239,7 +259,7 @@ public class VariantContext implements Feature { // to enable tribble intergrati * @param attributes attributes */ public VariantContext(String source, String contig, long start, long stop, Collection alleles, double negLog10PError, Set filters, Map attributes) { - this(source, contig, start, stop, alleles, NO_GENOTYPES, negLog10PError, filters, attributes, true); + this(source, contig, start, stop, alleles, NO_GENOTYPES, negLog10PError, filters, attributes, null, true); } /** @@ -256,7 +276,7 @@ public class VariantContext implements Feature { // to enable tribble intergrati * @param attributes attributes */ public VariantContext(String source, String contig, long start, long stop, Collection alleles, Collection genotypes, double negLog10PError, Set filters, Map attributes) { - this(source, contig, start, stop, alleles, genotypes != null ? genotypeCollectionToMap(new TreeMap(), genotypes) : null, negLog10PError, filters, attributes, false); + this(source, contig, start, stop, alleles, genotypes != null ? genotypeCollectionToMap(new TreeMap(), genotypes) : null, negLog10PError, filters, attributes, null, false); } /** @@ -269,7 +289,7 @@ public class VariantContext implements Feature { // to enable tribble intergrati * @param alleles alleles */ public VariantContext(String source, String contig, long start, long stop, Collection alleles) { - this(source, contig, start, stop, alleles, NO_GENOTYPES, InferredGeneticContext.NO_NEG_LOG_10PERROR, null, null, false); + this(source, contig, start, stop, alleles, NO_GENOTYPES, InferredGeneticContext.NO_NEG_LOG_10PERROR, null, null, null, false); } /** @@ -292,7 +312,7 @@ public class VariantContext implements Feature { // to enable tribble intergrati * @param other the VariantContext to copy */ public VariantContext(VariantContext other) { - this(other.getSource(), other.getChr(), other.getStart(), other.getEnd() , other.getAlleles(), other.getGenotypes(), other.getNegLog10PError(), other.filtersWereApplied() ? other.getFilters() : null, other.getAttributes(), false); + this(other.getSource(), other.getChr(), other.getStart(), other.getEnd() , other.getAlleles(), other.getGenotypes(), other.getNegLog10PError(), other.filtersWereApplied() ? other.getFilters() : null, other.getAttributes(), other.REFERENCE_BASE_FOR_INDEL, false); } /** @@ -307,8 +327,13 @@ public class VariantContext implements Feature { // to enable tribble intergrati * @param negLog10PError qual * @param filters filters: use null for unfiltered and empty set for passes filters * @param attributes attributes + * @param referenceBaseForIndel padded reference base + * @param genotypesAreUnparsed true if the genotypes have not yet been parsed */ - private VariantContext(String source, String contig, long start, long stop, Collection alleles, Map genotypes, double negLog10PError, Set filters, Map attributes, boolean genotypesAreUnparsed) { + private VariantContext(String source, String contig, long start, long stop, + Collection alleles, Map genotypes, + double negLog10PError, Set filters, Map attributes, + Byte referenceBaseForIndel, boolean genotypesAreUnparsed) { if ( contig == null ) { throw new IllegalArgumentException("Contig cannot be null"); } this.contig = contig; this.start = start; @@ -323,6 +348,7 @@ public class VariantContext implements Feature { // to enable tribble intergrati this.commonInfo = new InferredGeneticContext(source, negLog10PError, filters, attributes); filtersWereAppliedToContext = filters != null; + REFERENCE_BASE_FOR_INDEL = referenceBaseForIndel; if ( alleles == null ) { throw new IllegalArgumentException("Alleles cannot be null"); } @@ -355,23 +381,23 @@ public class VariantContext implements Feature { // to enable tribble intergrati // --------------------------------------------------------------------------------------------------------- public static VariantContext modifyGenotypes(VariantContext vc, Map genotypes) { - return new VariantContext(vc.getSource(), vc.getChr(), vc.getStart(), vc.getEnd(), vc.getAlleles(), genotypes, vc.getNegLog10PError(), vc.filtersWereApplied() ? vc.getFilters() : null, new HashMap(vc.getAttributes()), false); + return new VariantContext(vc.getSource(), vc.getChr(), vc.getStart(), vc.getEnd(), vc.getAlleles(), genotypes, vc.getNegLog10PError(), vc.filtersWereApplied() ? vc.getFilters() : null, new HashMap(vc.getAttributes()), vc.getReferenceBaseForIndel(), false); } public static VariantContext modifyLocation(VariantContext vc, String chr, int start, int end) { - return new VariantContext(vc.getSource(), chr, start, end, vc.getAlleles(), vc.genotypes, vc.getNegLog10PError(), vc.filtersWereApplied() ? vc.getFilters() : null, new HashMap(vc.getAttributes()), true); + return new VariantContext(vc.getSource(), chr, start, end, vc.getAlleles(), vc.genotypes, vc.getNegLog10PError(), vc.filtersWereApplied() ? vc.getFilters() : null, new HashMap(vc.getAttributes()), vc.getReferenceBaseForIndel(), true); } public static VariantContext modifyFilters(VariantContext vc, Set filters) { - return new VariantContext(vc.getSource(), vc.getChr(), vc.getStart(), vc.getEnd() , vc.getAlleles(), vc.genotypes, vc.getNegLog10PError(), filters, new HashMap(vc.getAttributes()), true); + return new VariantContext(vc.getSource(), vc.getChr(), vc.getStart(), vc.getEnd() , vc.getAlleles(), vc.genotypes, vc.getNegLog10PError(), filters, new HashMap(vc.getAttributes()), vc.getReferenceBaseForIndel(), true); } public static VariantContext modifyAttributes(VariantContext vc, Map attributes) { - return new VariantContext(vc.getSource(), vc.getChr(), vc.getStart(), vc.getEnd(), vc.getAlleles(), vc.genotypes, vc.getNegLog10PError(), vc.filtersWereApplied() ? vc.getFilters() : null, attributes, true); + return new VariantContext(vc.getSource(), vc.getChr(), vc.getStart(), vc.getEnd(), vc.getAlleles(), vc.genotypes, vc.getNegLog10PError(), vc.filtersWereApplied() ? vc.getFilters() : null, attributes, vc.getReferenceBaseForIndel(), true); } public static VariantContext modifyPErrorFiltersAndAttributes(VariantContext vc, double negLog10PError, Set filters, Map attributes) { - return new VariantContext(vc.getSource(), vc.getChr(), vc.getStart(), vc.getEnd(), vc.getAlleles(), vc.genotypes, negLog10PError, filters, attributes, true); + return new VariantContext(vc.getSource(), vc.getChr(), vc.getStart(), vc.getEnd(), vc.getAlleles(), vc.genotypes, negLog10PError, filters, attributes, vc.getReferenceBaseForIndel(), true); } // --------------------------------------------------------------------------------------------------------- @@ -414,7 +440,7 @@ public class VariantContext implements Feature { // to enable tribble intergrati * @return vc subcontext */ public VariantContext subContextFromGenotypes(Collection genotypes, Set alleles) { - return new VariantContext(getSource(), contig, start, stop, alleles, genotypes, getNegLog10PError(), filtersWereApplied() ? getFilters() : null, getAttributes()); + return new VariantContext(getSource(), contig, start, stop, alleles, genotypes != null ? genotypeCollectionToMap(new TreeMap(), genotypes) : null, getNegLog10PError(), filtersWereApplied() ? getFilters() : null, getAttributes(), getReferenceBaseForIndel()); } @@ -603,6 +629,15 @@ public class VariantContext implements Feature { // to enable tribble intergrati return (String)commonInfo.getAttribute(ID_KEY); } + public boolean hasReferenceBaseForIndel() { + return REFERENCE_BASE_FOR_INDEL != null; + } + + // the indel base that gets stripped off for indels + public Byte getReferenceBaseForIndel() { + return REFERENCE_BASE_FOR_INDEL; + } + // --------------------------------------------------------------------------------------------------------- // // get routines to access context info fields @@ -1151,6 +1186,7 @@ public class VariantContext implements Feature { // to enable tribble intergrati private boolean validate(boolean throwException) { try { + validateReferencePadding(); validateAlleles(); validateGenotypes(); } catch ( IllegalArgumentException e ) { @@ -1163,6 +1199,13 @@ public class VariantContext implements Feature { // to enable tribble intergrati return true; } + private void validateReferencePadding() { + boolean needsPadding = hasSymbolicAlleles() || (getReference().length() == getEnd() - getStart()); // off by one because padded base was removed + + if ( needsPadding && !hasReferenceBaseForIndel() ) + throw new ReviewedStingException("Badly formed variant context at location " + getChr() + ":" + getStart() + "; no padded reference base was provided."); + } + private void validateAlleles() { // check alleles boolean alreadySeenRef = false, alreadySeenNull = false; @@ -1221,16 +1264,6 @@ public class VariantContext implements Feature { // to enable tribble intergrati // // --------------------------------------------------------------------------------------------------------- - // the indel base that gets stripped off for indels - public boolean hasReferenceBaseForIndel() { - return hasAttribute(REFERENCE_BASE_FOR_INDEL_KEY); - } - - // the indel base that gets stripped off for indels - public byte getReferenceBaseForIndel() { - return hasReferenceBaseForIndel() ? (Byte)getAttribute(REFERENCE_BASE_FOR_INDEL_KEY) : (byte)'N'; - } - private void determineType() { if ( type == null ) { switch ( getNAlleles() ) { @@ -1357,8 +1390,7 @@ public class VariantContext implements Feature { // to enable tribble intergrati return false; } - public static VariantContext createVariantContextWithPaddedAlleles(VariantContext inputVC, byte inputRefBase, boolean refBaseShouldBeAppliedToEndOfAlleles) { - Allele refAllele = inputVC.getReference(); + public static VariantContext createVariantContextWithPaddedAlleles(VariantContext inputVC, boolean refBaseShouldBeAppliedToEndOfAlleles) { // see if we need to pad common reference base from all alleles boolean padVC; @@ -1368,31 +1400,20 @@ public class VariantContext implements Feature { // to enable tribble intergrati long locLength = (inputVC.getEnd() - inputVC.getStart()) + 1; if (inputVC.hasSymbolicAlleles()) padVC = true; - else if (refAllele.length() == locLength) + else if (inputVC.getReference().length() == locLength) padVC = false; - else if (refAllele.length() == locLength-1) + else if (inputVC.getReference().length() == locLength-1) padVC = true; else throw new IllegalArgumentException("Badly formed variant context at location " + String.valueOf(inputVC.getStart()) + " in contig " + inputVC.getChr() + ". Reference length must be at most one base shorter than location size"); - // nothing to do if we don't need to pad bases if (padVC) { - Byte refByte; - Map attributes = inputVC.getAttributes(); + if ( !inputVC.hasReferenceBaseForIndel() ) + throw new ReviewedStingException("Badly formed variant context at location " + inputVC.getChr() + ":" + inputVC.getStart() + "; no padded reference base is available."); - // upper-case for consistency; note that we can safely make these casts because the input is constrained to be a byte - inputRefBase = (byte)Character.toUpperCase((char)inputRefBase); - if (attributes.containsKey(VariantContext.REFERENCE_BASE_FOR_INDEL_KEY)) - refByte = (Byte)attributes.get(VariantContext.REFERENCE_BASE_FOR_INDEL_KEY); - else if (inputRefBase == 'A' || inputRefBase == 'T' || inputRefBase == 'C' || inputRefBase == 'G' || inputRefBase == 'N') - refByte = inputRefBase; - else - throw new IllegalArgumentException("Error when trying to pad Variant Context at location " + String.valueOf(inputVC.getStart()) - + " in contig " + inputVC.getChr() + - ". Either input reference base ("+(char)inputRefBase+ - ", ascii code="+inputRefBase+") must be a regular base, or input VC must contain reference base key"); + Byte refByte = inputVC.getReferenceBaseForIndel(); List alleles = new ArrayList(); Map genotypes = new TreeMap(); @@ -1444,11 +1465,7 @@ public class VariantContext implements Feature { // to enable tribble intergrati // Do not change the filter state if filters were not applied to this context Set inputVCFilters = inputVC.filtersWereAppliedToContext ? inputVC.getFilters() : null; - return new VariantContext(inputVC.getSource(), inputVC.getChr(), inputVC.getStart(), inputVC.getEnd(), alleles, genotypes, inputVC.getNegLog10PError(), - inputVCFilters, attributes); - - - + return new VariantContext(inputVC.getSource(), inputVC.getChr(), inputVC.getStart(), inputVC.getEnd(), alleles, genotypes, inputVC.getNegLog10PError(), inputVCFilters, inputVC.getAttributes()); } else return inputVC; diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java index 212600360..7d10749ee 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java @@ -295,10 +295,7 @@ public class VariantContextUtils { @Requires("vc != null") @Ensures("result != null") public static VariantContext sitesOnlyVariantContext(VariantContext vc) { - return new VariantContext(vc.getSource(), vc.getChr(), vc.getStart(), vc.getEnd(), - vc.getAlleles(), vc.getNegLog10PError(), - vc.filtersWereApplied() ? vc.getFilters() : null, - vc.getAttributes()); + return VariantContext.modifyGenotypes(vc, null); } /** @@ -449,7 +446,7 @@ public class VariantContextUtils { FilteredRecordMergeType filteredRecordMergeType, GenotypeMergeType genotypeMergeOptions, boolean annotateOrigin, boolean printMessages, byte inputRefBase ) { - return simpleMerge(genomeLocParser, unsortedVCs, priorityListOfVCs, filteredRecordMergeType, genotypeMergeOptions, annotateOrigin, printMessages, inputRefBase, "set", false, false); + return simpleMerge(genomeLocParser, unsortedVCs, priorityListOfVCs, filteredRecordMergeType, genotypeMergeOptions, annotateOrigin, printMessages, "set", false, false); } /** @@ -464,7 +461,6 @@ public class VariantContextUtils { * @param genotypeMergeOptions merge option for genotypes * @param annotateOrigin should we annotate the set it came from? * @param printMessages should we print messages? - * @param inputRefBase the ref base * @param setKey the key name of the set * @param filteredAreUncalled are filtered records uncalled? * @param mergeInfoWithMaxAC should we merge in info from the VC with maximum allele count? @@ -472,7 +468,7 @@ public class VariantContextUtils { */ public static VariantContext simpleMerge(GenomeLocParser genomeLocParser, Collection unsortedVCs, List priorityListOfVCs, FilteredRecordMergeType filteredRecordMergeType, GenotypeMergeType genotypeMergeOptions, - boolean annotateOrigin, boolean printMessages, byte inputRefBase, String setKey, + boolean annotateOrigin, boolean printMessages, String setKey, boolean filteredAreUncalled, boolean mergeInfoWithMaxAC ) { if ( unsortedVCs == null || unsortedVCs.size() == 0 ) return null; @@ -490,7 +486,7 @@ public class VariantContextUtils { for (VariantContext vc : prepaddedVCs) { // also a reasonable place to remove filtered calls, if needed if ( ! filteredAreUncalled || vc.isNotFiltered() ) - VCs.add(VariantContext.createVariantContextWithPaddedAlleles(vc,inputRefBase,false)); + VCs.add(VariantContext.createVariantContextWithPaddedAlleles(vc, false)); } if ( VCs.size() == 0 ) // everything is filtered out and we're filteredAreUncalled return null; diff --git a/public/java/test/org/broadinstitute/sting/gatk/report/GATKReportUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/report/GATKReportUnitTest.java new file mode 100644 index 000000000..02e1ba99a --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/gatk/report/GATKReportUnitTest.java @@ -0,0 +1,55 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.report; + +import org.broadinstitute.sting.BaseTest; +import org.testng.Assert; +import org.testng.annotations.Test; + +public class GATKReportUnitTest extends BaseTest { + @Test + public void testParse() throws Exception { + String reportPath = validationDataLocation + "exampleGATKReport.eval"; + GATKReport report = new GATKReport(reportPath); + + GATKReportTable countVariants = report.getTable("CountVariants"); + Assert.assertEquals(countVariants.getVersion(), GATKReportVersion.V0_1); + Object countVariantsPK = countVariants.getPrimaryKey("none.eval.none.all"); + Assert.assertEquals(countVariants.get(countVariantsPK, "nProcessedLoci"), "100000"); + Assert.assertEquals(countVariants.get(countVariantsPK, "nNoCalls"), "99872"); + + GATKReportTable validationReport = report.getTable("ValidationReport"); + Assert.assertEquals(validationReport.getVersion(), GATKReportVersion.V0_1); + Object validationReportPK = countVariants.getPrimaryKey("none.eval.none.known"); + Assert.assertEquals(validationReport.get(validationReportPK, "sensitivity"), "NaN"); + + GATKReportTable simpleMetricsByAC = report.getTable("SimpleMetricsByAC.metrics"); + Assert.assertEquals(simpleMetricsByAC.getVersion(), GATKReportVersion.V0_1); + Object simpleMetricsByACPK = simpleMetricsByAC.getPrimaryKey("none.eval.none.novel.ac2"); + Assert.assertEquals(simpleMetricsByAC.get(simpleMetricsByACPK, "AC"), "2"); + + Assert.assertFalse(simpleMetricsByAC.containsPrimaryKey("none.eval.none.novel.ac2.bad")); + } +} diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/diffengine/DiffObjectsIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/diffengine/DiffObjectsIntegrationTest.java index 77159d9c2..f9aaaecc1 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/diffengine/DiffObjectsIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/diffengine/DiffObjectsIntegrationTest.java @@ -30,8 +30,6 @@ import org.testng.annotations.Test; import java.io.File; import java.util.Arrays; -import java.util.Collections; -import java.util.List; public class DiffObjectsIntegrationTest extends WalkerTest { private class TestParams extends TestDataProvider { @@ -52,8 +50,8 @@ public class DiffObjectsIntegrationTest extends WalkerTest { @DataProvider(name = "data") public Object[][] createData() { - new TestParams(testDir + "diffTestMaster.vcf", testDir + "diffTestTest.vcf", "4d9f4636de05b93c354d05011264546e"); - new TestParams(testDir + "exampleBAM.bam", testDir + "exampleBAM.simple.bam", "37e6efd833b5cd6d860a9df3df9713fc"); + new TestParams(testDir + "diffTestMaster.vcf", testDir + "diffTestTest.vcf", "92311de76dda3f38aac289d807ef23d0"); + new TestParams(testDir + "exampleBAM.bam", testDir + "exampleBAM.simple.bam", "0c69412c385fda50210f2a612e1ffe4a"); return TestParams.getTests(TestParams.class); } diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/recalibration/RecalibrationWalkersPerformanceTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/recalibration/RecalibrationWalkersPerformanceTest.java index 1a063ec2a..f89b80ead 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/recalibration/RecalibrationWalkersPerformanceTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/recalibration/RecalibrationWalkersPerformanceTest.java @@ -16,7 +16,7 @@ public class RecalibrationWalkersPerformanceTest extends WalkerTest { " -L chr1:1-50,000,000" + " -standard" + " -OQ" + - " -B:dbsnp,vcf " + GATKDataLocation + "dbsnp_132.b36.excluding_sites_after_129.vcf" + + " -B:dbsnp,VCF " + GATKDataLocation + "dbsnp_132_hg18.vcf" + " -recalFile /dev/null" + moreArgs, 0, new ArrayList(0)); @@ -31,7 +31,7 @@ public class RecalibrationWalkersPerformanceTest extends WalkerTest { " -L " + evaluationDataLocation + "whole_exome_agilent_designed_120.targets.chr1.interval_list" + " -standard" + " -OQ" + - " -B:dbsnp,vcf " + GATKDataLocation + "dbsnp_132.b36.excluding_sites_after_129.vcf" + + " -B:dbsnp,VCF " + GATKDataLocation + "dbsnp_132.hg18.vcf" + " -recalFile /dev/null" + moreArgs, 0, new ArrayList(0)); diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariantsIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariantsIntegrationTest.java index 904a5b29b..9b152bc71 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariantsIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariantsIntegrationTest.java @@ -120,6 +120,6 @@ public class CombineVariantsIntegrationTest extends WalkerTest { @Test public void complexTestFull() { combineComplexSites("", "b5a53ee92bdaacd2bb3327e9004ae058"); } @Test public void complexTestMinimal() { combineComplexSites(" -minimalVCF", "df96cb3beb2dbb5e02f80abec7d3571e"); } - @Test public void complexTestSitesOnly() { combineComplexSites(" -sites_only", "f72a178137e25dbe0b931934cdc0079d"); } + @Test public void complexTestSitesOnly() { combineComplexSites(" -sites_only", "f704caeaaaed6711943014b847fe381a"); } @Test public void complexTestSitesOnlyMinimal() { combineComplexSites(" -sites_only -minimalVCF", "f704caeaaaed6711943014b847fe381a"); } } \ No newline at end of file diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/LeftAlignVariantsIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/LeftAlignVariantsIntegrationTest.java new file mode 100644 index 000000000..da6277242 --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/LeftAlignVariantsIntegrationTest.java @@ -0,0 +1,46 @@ +/* + * Copyright (c) 2010. + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.walkers.variantutils; + +import org.broadinstitute.sting.WalkerTest; +import org.testng.annotations.Test; + +import java.util.Arrays; + +/** + * Tests LeftAlignVariants + */ +public class LeftAlignVariantsIntegrationTest extends WalkerTest { + + @Test + public void testLeftAlignment() { + WalkerTestSpec spec = new WalkerTestSpec( + "-T LeftAlignVariants -o %s -R " + b37KGReference + " -B:variant,vcf " + validationDataLocation + "forLeftAlignVariantsTest.vcf -NO_HEADER", + 1, + Arrays.asList("158b1d71b28c52e2789f164500b53732")); + executeTest("test left alignment", spec); + } +} diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/VCFStreamingIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/VCFStreamingIntegrationTest.java index 113bd5491..458233b09 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/VCFStreamingIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/VCFStreamingIntegrationTest.java @@ -98,7 +98,7 @@ public class VCFStreamingIntegrationTest extends WalkerTest { " -EV CompOverlap -noEV -noST" + " -o %s", 1, - Arrays.asList("f60729c900bc8368717653b3fad80d1e") //"f60729c900bc8368717653b3fad80d1e" + Arrays.asList("ea09bf764adba9765b99921c5ba2c709") ); executeTest("testVCFStreamingChain", selectTestSpec); diff --git a/public/java/test/org/broadinstitute/sting/utils/codecs/vcf/IndexFactoryUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/codecs/vcf/IndexFactoryUnitTest.java index 68a2ecf8d..d08cda949 100755 --- a/public/java/test/org/broadinstitute/sting/utils/codecs/vcf/IndexFactoryUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/codecs/vcf/IndexFactoryUnitTest.java @@ -70,7 +70,7 @@ public class IndexFactoryUnitTest { CloseableTribbleIterator it = source.iterator(); while (it.hasNext() && (counter++ < maxRecords || maxRecords == -1) ) { VariantContext vc = it.next(); - writer.add(vc, vc.getReferenceBaseForIndel()); + writer.add(vc); } writer.close(); diff --git a/public/java/test/org/broadinstitute/sting/utils/genotype/vcf/VCFWriterUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/genotype/vcf/VCFWriterUnitTest.java index 34a2e616a..e3a926fb9 100644 --- a/public/java/test/org/broadinstitute/sting/utils/genotype/vcf/VCFWriterUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/genotype/vcf/VCFWriterUnitTest.java @@ -57,8 +57,8 @@ public class VCFWriterUnitTest extends BaseTest { VCFHeader header = createFakeHeader(metaData,additionalColumns); VCFWriter writer = new StandardVCFWriter(fakeVCFFile); writer.writeHeader(header); - writer.add(createVC(header),"A".getBytes()[0]); - writer.add(createVC(header),"A".getBytes()[0]); + writer.add(createVC(header)); + writer.add(createVC(header)); writer.close(); VCFCodec reader = new VCFCodec(); AsciiLineReader lineReader; @@ -135,7 +135,7 @@ public class VCFWriterUnitTest extends BaseTest { genotypes.put(name,gt); } - return new VariantContext("RANDOM",loc.getContig(), loc.getStart(), loc.getStop(), alleles, genotypes, 0, filters, attributes); + return new VariantContext("RANDOM",loc.getContig(), loc.getStart(), loc.getStop(), alleles, genotypes, 0, filters, attributes, (byte)'A'); } diff --git a/public/java/test/org/broadinstitute/sting/utils/text/TextFormattingUtilsUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/text/TextFormattingUtilsUnitTest.java new file mode 100644 index 000000000..45a618f71 --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/utils/text/TextFormattingUtilsUnitTest.java @@ -0,0 +1,88 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.utils.text; + +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.testng.Assert; +import org.testng.annotations.Test; + +import java.util.Arrays; +import java.util.Collections; + +public class TextFormattingUtilsUnitTest extends BaseTest { + @Test(expectedExceptions = ReviewedStingException.class) + public void testSplitWhiteSpaceNullLine() { + TextFormattingUtils.splitWhiteSpace(null); + } + + @Test + public void testSplitWhiteSpace() { + Assert.assertEquals(TextFormattingUtils.splitWhiteSpace("foo bar baz"), new String[] { "foo", "bar", "baz" }); + Assert.assertEquals(TextFormattingUtils.splitWhiteSpace("foo bar baz"), new String[] { "foo", "bar", "baz" }); + Assert.assertEquals(TextFormattingUtils.splitWhiteSpace(" foo bar baz"), new String[] { "foo", "bar", "baz" }); + Assert.assertEquals(TextFormattingUtils.splitWhiteSpace(" foo bar baz "), new String[] { "foo", "bar", "baz" }); + Assert.assertEquals(TextFormattingUtils.splitWhiteSpace("foo bar baz "), new String[] { "foo", "bar", "baz" }); + Assert.assertEquals(TextFormattingUtils.splitWhiteSpace("\tfoo\tbar\tbaz\t"), new String[]{"foo", "bar", "baz"}); + } + + @Test(expectedExceptions = ReviewedStingException.class) + public void testGetWordStartsNullLine() { + TextFormattingUtils.getWordStarts(null); + } + + @Test + public void testGetWordStarts() { + Assert.assertEquals(TextFormattingUtils.getWordStarts("foo bar baz"), Arrays.asList(4, 8)); + Assert.assertEquals(TextFormattingUtils.getWordStarts("foo bar baz"), Arrays.asList(5, 10)); + Assert.assertEquals(TextFormattingUtils.getWordStarts(" foo bar baz"), Arrays.asList(1, 5, 9)); + Assert.assertEquals(TextFormattingUtils.getWordStarts(" foo bar baz "), Arrays.asList(1, 5, 9)); + Assert.assertEquals(TextFormattingUtils.getWordStarts("foo bar baz "), Arrays.asList(4, 8)); + Assert.assertEquals(TextFormattingUtils.getWordStarts("\tfoo\tbar\tbaz\t"), Arrays.asList(1, 5, 9)); + } + + @Test(expectedExceptions = ReviewedStingException.class) + public void testSplitFixedWidthNullLine() { + TextFormattingUtils.splitFixedWidth(null, Collections.emptyList()); + } + + @Test(expectedExceptions = ReviewedStingException.class) + public void testSplitFixedWidthNullColumnStarts() { + TextFormattingUtils.splitFixedWidth("foo bar baz", null); + } + + @Test + public void testSplitFixedWidth() { + Assert.assertEquals(TextFormattingUtils.splitFixedWidth("foo bar baz", Arrays.asList(4, 8)), new String[] { "foo", "bar", "baz" }); + Assert.assertEquals(TextFormattingUtils.splitFixedWidth("foo bar baz", Arrays.asList(5, 10)), new String[] { "foo", "bar", "baz" }); + Assert.assertEquals(TextFormattingUtils.splitFixedWidth(" foo bar baz", Arrays.asList(5, 9)), new String[] { "foo", "bar", "baz" }); + Assert.assertEquals(TextFormattingUtils.splitFixedWidth(" foo bar baz ", Arrays.asList(5, 9)), new String[] { "foo", "bar", "baz" }); + Assert.assertEquals(TextFormattingUtils.splitFixedWidth("foo bar baz ", Arrays.asList(4, 8)), new String[] { "foo", "bar", "baz" }); + Assert.assertEquals(TextFormattingUtils.splitFixedWidth("\tfoo\tbar\tbaz\t", Arrays.asList(5, 9)), new String[] { "foo", "bar", "baz" }); + Assert.assertEquals(TextFormattingUtils.splitFixedWidth("f o b r b z", Arrays.asList(4, 8)), new String[] { "f o", "b r", "b z" }); + Assert.assertEquals(TextFormattingUtils.splitFixedWidth(" f o b r b z", Arrays.asList(4, 8)), new String[] { "f o", "b r", "b z" }); + Assert.assertEquals(TextFormattingUtils.splitFixedWidth(" f o b r b z", Arrays.asList(4, 8)), new String[] { "f", "o b", "r b z" }); + } +} diff --git a/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextUnitTest.java index e82817714..d8fa0eae4 100755 --- a/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextUnitTest.java @@ -92,45 +92,45 @@ public class VariantContextUnitTest { // test INDELs alleles = Arrays.asList(Aref, ATC); - vc = new VariantContext("test", snpLoc,snpLocStart, snpLocStop, alleles); + vc = new VariantContext("test", snpLoc,snpLocStart, snpLocStop, alleles, null, InferredGeneticContext.NO_NEG_LOG_10PERROR, null, null, (byte)'A'); Assert.assertEquals(vc.getType(), VariantContext.Type.INDEL); alleles = Arrays.asList(ATCref, A); - vc = new VariantContext("test", snpLoc,snpLocStart, snpLocStop+2, alleles); + vc = new VariantContext("test", snpLoc,snpLocStart, snpLocStop+2, alleles, null, InferredGeneticContext.NO_NEG_LOG_10PERROR, null, null, (byte)'A'); Assert.assertEquals(vc.getType(), VariantContext.Type.INDEL); alleles = Arrays.asList(Tref, TA, TC); - vc = new VariantContext("test", snpLoc,snpLocStart, snpLocStop, alleles); + vc = new VariantContext("test", snpLoc,snpLocStart, snpLocStop, alleles, null, InferredGeneticContext.NO_NEG_LOG_10PERROR, null, null, (byte)'A'); Assert.assertEquals(vc.getType(), VariantContext.Type.INDEL); alleles = Arrays.asList(ATCref, A, AC); - vc = new VariantContext("test", snpLoc,snpLocStart, snpLocStop+2, alleles); + vc = new VariantContext("test", snpLoc,snpLocStart, snpLocStop+2, alleles, null, InferredGeneticContext.NO_NEG_LOG_10PERROR, null, null, (byte)'A'); Assert.assertEquals(vc.getType(), VariantContext.Type.INDEL); alleles = Arrays.asList(ATCref, A, Allele.create("ATCTC")); - vc = new VariantContext("test", snpLoc,snpLocStart, snpLocStop+2, alleles); + vc = new VariantContext("test", snpLoc,snpLocStart, snpLocStop+2, alleles, null, InferredGeneticContext.NO_NEG_LOG_10PERROR, null, null, (byte)'A'); Assert.assertEquals(vc.getType(), VariantContext.Type.INDEL); // test MIXED alleles = Arrays.asList(TAref, T, TC); - vc = new VariantContext("test", snpLoc,snpLocStart, snpLocStop+1, alleles); + vc = new VariantContext("test", snpLoc,snpLocStart, snpLocStop+1, alleles, null, InferredGeneticContext.NO_NEG_LOG_10PERROR, null, null, (byte)'A'); Assert.assertEquals(vc.getType(), VariantContext.Type.MIXED); alleles = Arrays.asList(TAref, T, AC); - vc = new VariantContext("test", snpLoc,snpLocStart, snpLocStop+1, alleles); + vc = new VariantContext("test", snpLoc,snpLocStart, snpLocStop+1, alleles, null, InferredGeneticContext.NO_NEG_LOG_10PERROR, null, null, (byte)'A'); Assert.assertEquals(vc.getType(), VariantContext.Type.MIXED); alleles = Arrays.asList(ACref, ATC, AT); - vc = new VariantContext("test", snpLoc,snpLocStart, snpLocStop+1, alleles); + vc = new VariantContext("test", snpLoc,snpLocStart, snpLocStop+1, alleles, null, InferredGeneticContext.NO_NEG_LOG_10PERROR, null, null, (byte)'A'); Assert.assertEquals(vc.getType(), VariantContext.Type.MIXED); alleles = Arrays.asList(Aref, T, symbolic); - vc = new VariantContext("test", snpLoc,snpLocStart, snpLocStop, alleles); + vc = new VariantContext("test", snpLoc,snpLocStart, snpLocStop, alleles, null, InferredGeneticContext.NO_NEG_LOG_10PERROR, null, null, (byte)'A'); Assert.assertEquals(vc.getType(), VariantContext.Type.MIXED); // test SYMBOLIC alleles = Arrays.asList(Tref, symbolic); - vc = new VariantContext("test", snpLoc,snpLocStart, snpLocStop, alleles); + vc = new VariantContext("test", snpLoc,snpLocStart, snpLocStop, alleles, null, InferredGeneticContext.NO_NEG_LOG_10PERROR, null, null, (byte)'A'); Assert.assertEquals(vc.getType(), VariantContext.Type.SYMBOLIC); } @@ -191,7 +191,7 @@ public class VariantContextUnitTest { @Test public void testCreatingDeletionVariantContext() { List alleles = Arrays.asList(ATCref, del); - VariantContext vc = new VariantContext("test", delLoc, delLocStart, delLocStop, alleles); + VariantContext vc = new VariantContext("test", delLoc, delLocStart, delLocStop, alleles, null, InferredGeneticContext.NO_NEG_LOG_10PERROR, null, null, (byte)'A'); Assert.assertEquals(vc.getChr(), delLoc); Assert.assertEquals(vc.getStart(), delLocStart); @@ -218,7 +218,7 @@ public class VariantContextUnitTest { @Test public void testCreatingInsertionVariantContext() { List alleles = Arrays.asList(delRef, ATC); - VariantContext vc = new VariantContext("test", insLoc, insLocStart, insLocStop, alleles); + VariantContext vc = new VariantContext("test", insLoc, insLocStart, insLocStop, alleles, null, InferredGeneticContext.NO_NEG_LOG_10PERROR, null, null, (byte)'A'); Assert.assertEquals(vc.getChr(), insLoc); Assert.assertEquals(vc.getStart(), insLocStart); @@ -251,7 +251,7 @@ public class VariantContextUnitTest { new VariantContext("test", insLoc, insLocStart, insLocStop, Arrays.asList(delRef, del)); } - @Test (expectedExceptions = IllegalArgumentException.class) + @Test (expectedExceptions = IllegalStateException.class) public void testBadConstructorArgs3() { new VariantContext("test", insLoc, insLocStart, insLocStop, Arrays.asList(del)); } diff --git a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/ExampleUnifiedGenotyper.scala b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/ExampleUnifiedGenotyper.scala index 4a93233eb..1d473b210 100644 --- a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/ExampleUnifiedGenotyper.scala +++ b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/ExampleUnifiedGenotyper.scala @@ -59,10 +59,10 @@ class ExampleUnifiedGenotyper extends QScript { evalUnfiltered.rodBind :+= RodBind("eval", "VCF", genotyper.out) evalUnfiltered.out = swapExt(genotyper.out, "vcf", "eval") - variantFilter.rodBind :+= RodBind("vcf", "VCF", genotyper.out) + variantFilter.rodBind :+= RodBind("variant", "VCF", genotyper.out) variantFilter.out = swapExt(qscript.bamFile, "bam", "filtered.vcf") variantFilter.filterName = filterNames - variantFilter.filterExpression = filterExpressions + variantFilter.filterExpression = filterExpressions.map("\"" + _ + "\"") evalFiltered.rodBind :+= RodBind("eval", "VCF", variantFilter.out) evalFiltered.out = swapExt(variantFilter.out, "vcf", "eval") diff --git a/public/scala/test/org/broadinstitute/sting/queue/pipeline/PipelineTest.scala b/public/scala/test/org/broadinstitute/sting/queue/pipeline/PipelineTest.scala index c2c956118..27ac559c5 100644 --- a/public/scala/test/org/broadinstitute/sting/queue/pipeline/PipelineTest.scala +++ b/public/scala/test/org/broadinstitute/sting/queue/pipeline/PipelineTest.scala @@ -34,8 +34,8 @@ import org.broadinstitute.sting.BaseTest import org.broadinstitute.sting.MD5DB import org.broadinstitute.sting.queue.QCommandLine import org.broadinstitute.sting.queue.util.{Logging, ProcessController} -import java.io.{FileNotFoundException, File} -import org.broadinstitute.sting.gatk.report.GATKReportParser +import java.io.File +import org.broadinstitute.sting.gatk.report.GATKReport import org.apache.commons.io.FileUtils import org.broadinstitute.sting.queue.engine.CommandLinePluginManager @@ -118,12 +118,11 @@ object PipelineTest extends BaseTest with Logging { // write the report to the shared validation data location val formatter = new SimpleDateFormat("yyyy.MM.dd.HH.mm.ss") val reportLocation = "%s%s/%s/validation.%s.eval".format(validationReportsDataLocation, jobRunner, name, formatter.format(new Date)) - val report = new File(reportLocation) + val reportFile = new File(reportLocation) - FileUtils.copyFile(new File(runDir(name, jobRunner) + evalSpec.evalReport), report); + FileUtils.copyFile(new File(runDir(name, jobRunner) + evalSpec.evalReport), reportFile); - val parser = new GATKReportParser - parser.parse(report) + val report = new GATKReport(reportFile); var allInRange = true @@ -131,7 +130,9 @@ object PipelineTest extends BaseTest with Logging { println(name + " validation values:") println(" value (min,target,max) table key metric") for (validation <- evalSpec.validations) { - val value = parser.getValue(validation.table, validation.key, validation.metric) + val table = report.getTable(validation.table) + val key = table.getPrimaryKey(validation.key) + val value = String.valueOf(table.get(key, validation.metric)) val inRange = if (value == null) false else validation.inRange(value) val flag = if (!inRange) "*" else " " println(" %s %s (%s,%s,%s) %s %s %s".format(flag, value, validation.min, validation.target, validation.max, validation.table, validation.key, validation.metric))