/*
 * Provenance: recovered from a git patch for commit
 * 457568485ac3ef703ed93a0b1e7aa45a52affe3d ("simple Beagle input ROD",
 * author depristo, Tue, 9 Feb 2010 01:21:04 +0000;
 * git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2811 348d0f76-0448-11de-a6fe-93d51630548a),
 * which created java/src/org/broadinstitute/sting/gatk/refdata/BeagleROD.java.
 */
package org.broadinstitute.sting.gatk.refdata;

import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.xReadLines;

import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.io.IOException;
import java.io.File;
import java.io.FileNotFoundException;

/**
 * Reference-ordered datum for a space-delimited Beagle input file.
 *
 * <p>The file is expected to start with a header line whose first field is
 * {@code I}, for example {@code I id NA12891 NA12891 NA12892 NA12892}; the
 * fields from the third one onwards are taken as sample names. Lines whose
 * first field is {@code M} carry a marker name in the second field, written as
 * {@code c<contig>_p<position>}, followed by one value per header sample column.
 *
 * <p>Values are stored per sample name. A sample name that occurs in several
 * header columns (as in the example above) shares a single list, so its values
 * are appended in column order. The per-sample lists are never cleared: every
 * successful {@link #parseLine} call on the same instance appends to them.
 */
public class BeagleROD extends BasicReferenceOrderedDatum {
    /** First field of the header line that lists the sample names. */
    private static final String HEADER_TAG = "I";

    /** First field of a marker line. */
    private static final String MARKER_TAG = "M";

    /** Index of the first per-sample column, in both header and marker lines. */
    private static final int FIRST_SAMPLE_COLUMN = 2;

    /** Marker names look like {@code c<contig>_p<position>}. */
    private static final Pattern MARKER_PATTERN = Pattern.compile("c(.+)_p([0-9]+)");

    /** Location of the most recently parsed marker line; null until one is parsed. */
    GenomeLoc loc;

    /** Sample names read from the header by {@link #initialize}; null until then. */
    List<String> sampleNames = null;

    /** Values seen so far, keyed by sample name. */
    Map<String, List<String>> sampleGenotypes = new HashMap<String, List<String>>();

    /**
     * Creates an empty datum.
     *
     * @param name passed through unchanged to the superclass constructor
     */
    public BeagleROD(String name) {
        super(name);
    }

    /** @return the fixed label {@code "BeagleRod"} */
    @Override
    public String toString() { return "BeagleRod"; }

    /** @return the field separator for Beagle lines: a single space */
    public String delimiterRegex() {
        return " ";
    }

    /** @return the location of the last marker line parsed, or null if none has been parsed */
    public GenomeLoc getLocation() {
        return loc;
    }

    /** @return the sample names read by {@link #initialize}, or null if it has not been called */
    public List<String> getSampleNames() {
        return sampleNames;
    }

    /** @return the live (not copied) map from sample name to the values collected for it */
    public Map<String, List<String>> getGenotypes() {
        return sampleGenotypes;
    }

    /**
     * Reads the first line of {@code source} and extracts the sample names from it.
     *
     * @param source the Beagle file to read the header from
     * @return the sample names (a {@code List<String>}), also stored on this instance
     * @throws FileNotFoundException declared for opening {@code source}
     * @throws IllegalStateException if the first line does not start with {@code I},
     *                               or has no field after the {@code I} tag
     */
    public Object initialize(final File source) throws FileNotFoundException {
        // NOTE(review): the xReadLines reader opened here is never closed. Its API is
        // not visible from this file -- if it exposes close(), release it in a finally block.
        String firstLine = new xReadLines(source).next();
        String[] parts = firstLine.split(" ");

        if ( ! parts[0].equals(HEADER_TAG) ) {
            throw new IllegalStateException("Beagle file " + source + " doesn't have required header line I");
        }
        // A lone "I" would otherwise reach subList(2, 1) and fail with an opaque
        // IllegalArgumentException ("fromIndex > toIndex").
        if ( parts.length < FIRST_SAMPLE_COLUMN ) {
            throw new IllegalStateException("Beagle file " + source + " has a truncated header line: " + firstLine);
        }

        // I id NA12891 NA12891 NA12892 NA12892
        sampleNames = Arrays.asList(parts).subList(FIRST_SAMPLE_COLUMN, parts.length);
        return sampleNames;
    }

    /**
     * Converts a marker name of the form {@code c<contig>_p<position>} into a
     * single-base location on that contig.
     *
     * @param markerName the marker name to parse
     * @return the location built by {@code GenomeLocParser.createGenomeLoc(contig, position, position)}
     * @throws IllegalArgumentException if {@code markerName} does not match the expected form
     */
    public static GenomeLoc parseMarkerName(String markerName) {
        Matcher m = MARKER_PATTERN.matcher(markerName);
        if ( ! m.matches() ) {
            throw new IllegalArgumentException("Malformatted Beagle marker name: " + markerName
                    + "; required format is c<contig>_p<position>");
        }

        String contig = m.group(1);
        long start = Long.parseLong(m.group(2));
        return GenomeLocParser.createGenomeLoc(contig, start, start);
    }

    /**
     * Parses one already-split line. Only marker lines (first field {@code M}) are
     * consumed: the location is updated and each value is appended to the list of
     * the sample named by the matching header column. The line is validated before
     * any state is changed, so a rejected line leaves this instance untouched.
     *
     * @param header the object returned by {@link #initialize}: the list of sample names
     * @param parts  the fields of the line
     * @return true if the line was a marker line and was stored, false otherwise
     * @throws IOException              declared by the signature; not thrown by this implementation
     * @throws IllegalArgumentException if a marker line has no marker name, has a malformed
     *                                  marker name, or has more values than the header has samples
     */
    public boolean parseLine(final Object header, final String[] parts) throws IOException {
        if ( parts.length == 0 || ! parts[0].equals(MARKER_TAG) ) {
            return false;
        }
        if ( parts.length < FIRST_SAMPLE_COLUMN ) {
            throw new IllegalArgumentException("Beagle marker line has no marker name");
        }

        // The header is whatever initialize() returned, i.e. the sample-name list.
        @SuppressWarnings("unchecked")
        List<String> headerSampleNames = (List<String>) header;

        int valueCount = parts.length - FIRST_SAMPLE_COLUMN;
        if ( valueCount > 0 && headerSampleNames.size() < valueCount ) {
            throw new IllegalArgumentException("Beagle marker line " + parts[1] + " has " + valueCount
                    + " values but the header names only " + headerSampleNames.size() + " samples");
        }

        loc = parseMarkerName(parts[1]);

        for ( int i = FIRST_SAMPLE_COLUMN; i < parts.length; i++ ) {
            String sampleName = headerSampleNames.get(i - FIRST_SAMPLE_COLUMN);
            List<String> values = sampleGenotypes.get(sampleName);
            if ( values == null ) {
                values = new ArrayList<String>();
                sampleGenotypes.put(sampleName, values);
            }
            values.add(parts[i]);
        }

        return true;
    }
}