gatk-3.8/java/src/org/broadinstitute/sting/gatk/refdata/TabularROD.java

package org.broadinstitute.sting.gatk.refdata;

import java.util.*;
import java.util.regex.Pattern;
import java.util.regex.Matcher;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;

import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.xReadLines;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.apache.log4j.Logger;

/**
 * Reference ordered datum backed by a line of whitespace-delimited tabular text.
 *
 * User: mdepristo
 * Date: Feb 27, 2009
 * Time: 10:47:14 AM
 *
 * Files read by this class have the following layout:
 *
 * # comment line
 * # must include HEADER KEYWORD
 * HEADER COL1 ... COLN
 * chr:pos data1 ... dataN
 *
 * Lines starting with '#' are comments.  A line starting with HEADER (or loc) names the columns; the
 * first column of every data row is the location string and is stored under the first header name.
 * When no header line is found, the columns are named by their index ("0", "1", ...).
 *
 * The class implements the rod interface, so tabular files can be accessed through the normal ROD
 * system.  It also implements Map&lt;String, String&gt;, mapping column name to the value in this row.
 *
 * You can also write your own files, as such:
 *
 * ArrayList<String> header = new ArrayList<String>(Arrays.asList("HEADER", "col1", "col2", "col3"));
 * assertTrue(TabularROD.headerString(header).equals("HEADER\tcol1\tcol2\tcol3"));
 * String rowData = String.format("%d %d %d", 1, 2, 3);
 * TabularROD row = new TabularROD("myName", header, GenomeLoc.parseGenomeLoc("chrM", 1), rowData.split(" "));
 * assertTrue(row.toString().equals("chrM:1\t1\t2\t3"));
 */
public class TabularROD extends BasicReferenceOrderedDatum implements Map<String, String> {
    private static final Logger logger = Logger.getLogger(TabularROD.class);

    /** Explicit location of this record; when null the location is parsed from the first column. */
    protected GenomeLoc loc;
    /** Column name -> value bindings for this row. */
    private HashMap<String, String> attributes;
    /** Ordered column names; null until supplied by a constructor or by parseLine. */
    private ArrayList<String> header;

    public static String DEFAULT_DELIMITER = "\t";
    public static String DEFAULT_DELIMITER_REGEX = "\\s+";

    /** String placed between fields by headerString. */
    public static String DELIMITER = DEFAULT_DELIMITER;
    /** Regular expression used to split header and data lines into fields. */
    public static String DELIMITER_REGEX = DEFAULT_DELIMITER_REGEX;

    private static final int MAX_LINES_TO_LOOK_FOR_HEADER = 1000;
    private static final Pattern HEADER_PATTERN = Pattern.compile("^\\s*((HEADER)|(loc)).*");
    private static final Pattern COMMENT_PATTERN = Pattern.compile("^#.*");

    private static int parsedRecords = 0;
    final static boolean printRecordsParsed = false;

    /**
     * Set the global tabular ROD delimiter and the regex used to split on it.
     *
     * The delimiter is the string written between fields, while the regex is used to split lines
     * that are read.  Both are static, so the change applies to every TabularROD.
     *
     * @param delimiter      string to place between fields when writing
     * @param delimeterRegex regular expression used to split lines into fields when reading
     */
    public static void setDelimiter(final String delimiter, final String delimeterRegex) {
        DELIMITER = delimiter;
        DELIMITER_REGEX = delimeterRegex;
    }

    /**
     * Returns a parsable string representation of the header line.
     *
     * @param header column names; the first element must be the HEADER keyword
     * @return the column names joined with the current DELIMITER
     * @throws RuntimeException if header is null, empty, or does not start with HEADER
     */
    public static String headerString(final ArrayList<String> header) {
        requireGoodHeader(header);
        return Utils.join(DELIMITER, header);
    }

    /**
     * Returns a comment line containing the *single line* string msg
     *
     * @param msg text of the comment
     * @return msg prefixed with "# "
     */
    public static String commentString(final String msg) {
        return "# " + msg;
    }

    /** A header is good when it is non-null, non-empty and its first element is the HEADER keyword. */
    private static boolean headerIsGood(final ArrayList<String> header) {
        // null is treated like an empty header so callers get the descriptive exception below
        // rather than a bare NullPointerException
        if ( header == null || header.isEmpty() ) return false;
        return header.get(0).equals("HEADER");
    }

    /** Throws a RuntimeException unless headerIsGood(header). */
    private static void requireGoodHeader(final ArrayList<String> header) {
        if ( ! headerIsGood(header) )
            throw new RuntimeException("Header must begin with HEADER keyword");
    }

    // ----------------------------------------------------------------------
    //
    // Constructors
    //
    // ----------------------------------------------------------------------

    /**
     * Make a new TabularROD with name and no header, location, or data.  The header and data are
     * expected to be supplied later through parseLine.
     *
     * @param name name of this ROD
     */
    public TabularROD(final String name) {
        super(name);
        attributes = new HashMap<String, String>();
    }

    /**
     * Make a new TabularROD with name, using header columns header, at loc, without any bound data.  Data
     * must be bound to each corresponding header[i] field before the object is really usable.
     *
     * @param name   name of this ROD
     * @param header column names; the first element must be the HEADER keyword
     * @param loc    location of this record
     * @throws RuntimeException if header does not start with the HEADER keyword
     */
    public TabularROD(final String name, ArrayList<String> header, GenomeLoc loc) {
        this(name);
        this.header = header;
        this.loc = loc;
        requireGoodHeader(this.header);
    }

    /**
     * Make a new TabularROD with name, using header columns header, at loc, with data associated with the
     * header columns.  data and header are assumed to be in the same order; since header[0] is the HEADER
     * keyword, the bindings established are header[i+1] = data[i].  The TabularROD at this stage can be
     * printed and manipulated; it is considered a full fledged, initialized object.
     *
     * @param name   name of this ROD
     * @param header column names; the first element must be the HEADER keyword
     * @param loc    location of this record
     * @param data   one value per header column after the HEADER keyword
     * @throws RuntimeException if the header is bad or data.length != header.size() - 1
     */
    public TabularROD(final String name, ArrayList<String> header, GenomeLoc loc, String[] data) {
        this(name, header, loc);

        // the HEADER keyword occupies header[0] and has no data element of its own
        if ( header.size() != data.length + 1 )
            throw new RuntimeException(String.format("Incorrect tabular data format: header has %d columns but %d data elements were provided: %s",
                                                    header.size(), data.length, Utils.join("\t", data)));
        for ( int i = 0; i < data.length; i++ ) {
            put(header.get(i+1), data[i]);
        }
    }

    /**
     * Walks through the source file looking for the header line, which it returns as a
     * list of strings.  If several of the scanned lines match the header pattern, the last one wins.
     *
     * When no header line is found, the header is synthesized from column indexes ("0", "1", ...)
     * using the number of fields in the first available non-comment line.  If there is no such line
     * (for example the file is empty) an empty header is returned.
     *
     * @param source file to read the header from
     * @return an ArrayList&lt;String&gt; of column names, suitable for passing to parseLine
     * @throws FileNotFoundException if source cannot be opened
     */
    public Object initialize(final File source) throws FileNotFoundException {
        // NOTE(review): the readers opened here are never explicitly closed; xReadLines' close
        // semantics are not visible from this file -- confirm whether an explicit close is needed.
        List<String> header = null;
        int linesLookedAt = 0;
        xReadLines reader = new xReadLines(source);

        for ( String line : reader ) {
            if ( HEADER_PATTERN.matcher(line).matches() )
                header = new ArrayList<String>(Arrays.asList(line.split(DELIMITER_REGEX)));

            // post-increment comparison: the scan stops after MAX_LINES_TO_LOOK_FOR_HEADER + 2 lines
            if ( linesLookedAt++ > MAX_LINES_TO_LOOK_FOR_HEADER )
                break;
        }

        if ( header != null ) {
            logger.debug(String.format("HEADER IS %s%n", Utils.join(":", header)));
            return header;
        }

        // no header line: use the indexes as the header fields
        logger.debug("USING INDEXES FOR ROD HEADER");
        // the scan above may have consumed the whole file; start over if so
        if ( !reader.hasNext() )
            reader = new xReadLines(source);

        header = new ArrayList<String>();
        final String firstDataLine = firstNonCommentLine(reader);
        if ( firstDataLine == null ) {
            // empty file, or nothing but comments: there are no columns to name
            logger.debug("NO DATA LINES FOUND; ROD HEADER IS EMPTY");
            return header;
        }

        final int tokens = firstDataLine.split(DELIMITER_REGEX).length;
        for ( int i = 0; i < tokens; i++ )
            header.add(Integer.toString(i));

        return header;
    }

    /**
     * Returns the next line from reader that is not a comment, or null if the reader runs out.
     * Comment lines are skipped because parseLine ignores them, so their field count says nothing
     * about the number of data columns.
     */
    private static String firstNonCommentLine(final xReadLines reader) {
        while ( reader.hasNext() ) {
            final String candidate = reader.next();
            if ( candidate != null && ! COMMENT_PATTERN.matcher(candidate).matches() )
                return candidate;
        }
        return null;
    }

    // ----------------------------------------------------------------------
    //
    // ROD accessors
    //
    // ----------------------------------------------------------------------

    /**
     * Returns the location of this record: the explicit loc if one was given, otherwise the value
     * bound to the first header column parsed as a genome location.
     *
     * @return the location, or null if there is no loc, no header, or no value for the first column
     */
    public GenomeLoc getLocation() {
        if ( loc != null )
            return loc;
        // a ROD built with only a name has no header until parseLine has been called
        if ( header == null || header.isEmpty() )
            return null;
        final String locString = get(header.get(0));
        return locString == null ? null : GenomeLocParser.parseGenomeLoc(locString);
    }

    /** @return the ordered column names, or null if no header has been set yet */
    public ArrayList<String> getHeader() {
        return header;
    }

    public String get(final Object key) {
        return attributes.get(key);
    }

    public String put(final String key, final String object) {
        return attributes.put(key, object);
    }

    public boolean containsKey(final Object key) {
        return attributes.containsKey(key);
    }

    /** @return the live map of column name to value backing this ROD (not a copy) */
    public HashMap<String,String> getAttributes() {
        return attributes;
    }

    /**
     * Returns the bound values in header order, joined by tabs.  Header columns without a bound
     * value (such as the HEADER keyword itself) are skipped.
     *
     * @return tab-separated values; empty if no header has been set
     */
    public String getAttributeString() {
        // NOTE(review): rows are always joined with a tab here while headerString uses DELIMITER;
        // confirm whether row output should follow setDelimiter as well.
        final List<String> strings = new ArrayList<String>(attributes.size());
        if ( header != null ) {
            for ( String key : header ) {
                if ( containsKey(key) ) // avoid the header
                    strings.add(this.get(key));
            }
        }
        return Utils.join("\t", strings);
    }

    // ----------------------------------------------------------------------
    //
    // map functions -- all delegate to the attributes map
    //
    // ----------------------------------------------------------------------
    public int size()                               { return attributes.size(); }
    public boolean isEmpty()                        { return attributes.isEmpty(); }
    public boolean containsValue(Object o)          { return attributes.containsValue(o); }
    public String remove(Object o)                  { return attributes.remove(o); }
    public void clear()                             { attributes.clear(); }
    public java.util.Set<String> keySet()           { return attributes.keySet(); }
    public java.util.Collection<String> values()    { return attributes.values(); }

    public void putAll(java.util.Map<? extends String, ? extends String> map) {
        attributes.putAll(map);
    }

    public java.util.Set<java.util.Map.Entry<String,String>> entrySet() {
        return attributes.entrySet();
    }

    // ----------------------------------------------------------------------
    //
    // formatting
    //
    // ----------------------------------------------------------------------

    /**
     * @return the attribute string, prefixed by the explicit loc and a tab when loc is set
     */
    @Override
    public String toString() {
        if ( loc != null )
            return String.format("%s\t%s", loc, getAttributeString());
        return String.format("%s", getAttributeString());
    }

    /**
     * The delimiter regular expression that should be used to separate fields in data rows
     * and header.
     *
     * @return the current DELIMITER_REGEX
     */
    public String delimiterRegex() {
        return DELIMITER_REGEX;
    }

    /**
     * Used by ROD management system to set the data in this ROD associated with a line in a rod
     *
     * @param headerObj the header returned by initialize; must be an ArrayList&lt;String&gt;
     * @param parts     the fields of one line of the file
     * @return false if the line is empty, a comment, or a header line; true if it was bound as data
     * @throws IOException if the number of fields differs from the number of header columns
     */
    public boolean parseLine(final Object headerObj, final String[] parts) throws IOException {
        // initialize() always hands back an ArrayList<String>, so the cast is safe for that caller
        @SuppressWarnings("unchecked")
        final ArrayList<String> parsedHeader = (ArrayList<String>)(headerObj);
        header = parsedHeader;

        if ( parts.length == 0 || COMMENT_PATTERN.matcher(parts[0]).matches() || HEADER_PATTERN.matcher(parts[0]).matches() )
            return false;

        if (header.size() != parts.length) {
            throw new IOException(String.format("Header length %d not equal to Tabular parts length %d", header.size(), parts.length));
        }

        for ( int i = 0; i < parts.length; i++ ) {
            put(header.get(i), parts[i]);
        }

        if ( printRecordsParsed ) System.out.printf("Parsed %d records %s%n", ++parsedRecords, this);

        return true;
    }
}