Significant memory improvements to plink code
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3144 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
75c1987a18
commit
e73e6a4fb0
|
|
@ -114,7 +114,13 @@ public class PlinkRod extends BasicReferenceOrderedDatum implements Iterator<Pli
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public Map<String, List<byte[]>> getGenotypes() {
|
/* Get the mapping from sample name to genotypes (array of Alleles).
|
||||||
|
* Important note: none of the Alleles returned here are annotated as reference
|
||||||
|
* (since the rod doesn't know offhand what the reference allele is).
|
||||||
|
*
|
||||||
|
* @return mapping from sample name to genotype
|
||||||
|
*/
|
||||||
|
public Map<String, Allele[]> getGenotypes() {
|
||||||
return currentVariant.getGenotypes();
|
return currentVariant.getGenotypes();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -162,17 +168,15 @@ public class PlinkRod extends BasicReferenceOrderedDatum implements Iterator<Pli
|
||||||
private ArrayList<PlinkVariantInfo> parseTextFormattedPlinkFile( File file ) {
|
private ArrayList<PlinkVariantInfo> parseTextFormattedPlinkFile( File file ) {
|
||||||
try {
|
try {
|
||||||
BufferedReader reader = new BufferedReader( new FileReader ( file ) );
|
BufferedReader reader = new BufferedReader( new FileReader ( file ) );
|
||||||
String header = reader.readLine();
|
ArrayList<PlinkVariantInfo> seqVars = new ArrayList<PlinkVariantInfo>();
|
||||||
ArrayList<PlinkVariantInfo> seqVars = instantiateVariantListFromHeader(header);
|
int headerFieldCount = instantiateVariantListFromHeader(seqVars, reader.readLine());
|
||||||
ArrayList<Integer> snpOffsets = getSNPOffsetsFromHeader(header);
|
|
||||||
|
|
||||||
sampleNames = new ArrayList<String>();
|
sampleNames = new ArrayList<String>();
|
||||||
|
|
||||||
String line;
|
String line;
|
||||||
long counter = 0;
|
|
||||||
do {
|
do {
|
||||||
line = reader.readLine();
|
line = reader.readLine();
|
||||||
incorporateInfo(seqVars,snpOffsets,line);
|
incorporateInfo(seqVars, line, headerFieldCount);
|
||||||
} while ( line != null );
|
} while ( line != null );
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -188,27 +192,27 @@ public class PlinkRod extends BasicReferenceOrderedDatum implements Iterator<Pli
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private void incorporateInfo(List<PlinkVariantInfo> vars, List<Integer> offsets, String plinkLine) {
|
private void incorporateInfo(List<PlinkVariantInfo> vars, String plinkLine, int headerFieldCount) {
|
||||||
if ( plinkLine == null ) {
|
if ( plinkLine == null ) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
String[] plinkInfo;
|
|
||||||
if ( plinkFileType != PlinkFileType.STANDARD_PED )
|
if ( plinkFileType != PlinkFileType.STANDARD_PED )
|
||||||
throw new StingException("Plink file is likely of .raw or recoded format. Please use an uncoded .ped file.");
|
throw new StingException("Plink file is likely of .raw or recoded format. Please use an uncoded .ped file.");
|
||||||
|
|
||||||
plinkInfo = plinkLine.split("\t");
|
StringTokenizer st = new StringTokenizer(plinkLine, "\t");
|
||||||
String individualName = plinkInfo[1];
|
st.nextToken(); // family ID
|
||||||
sampleNames.add(individualName);
|
sampleNames.add(st.nextToken());
|
||||||
|
for (int i = 2; i < headerFieldCount; i++)
|
||||||
|
st.nextToken();
|
||||||
|
|
||||||
int snpNumber = 0;
|
int snpNumber = 0;
|
||||||
for ( int i : offsets ) {
|
while ( snpNumber < vars.size() ) {
|
||||||
vars.get(snpNumber).addGenotypeEntry(plinkInfo[i].split("\\s+"), individualName);
|
vars.get(snpNumber++).addGenotypeEntry(st.nextToken().split("\\s+"));
|
||||||
snpNumber++;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private ArrayList<PlinkVariantInfo> instantiateVariantListFromHeader(String header) {
|
private int instantiateVariantListFromHeader(ArrayList<PlinkVariantInfo> seqVars, String header) {
|
||||||
// if the first line is not a comment (what we're used to seeing),
|
// if the first line is not a comment (what we're used to seeing),
|
||||||
// then it's the raw header (comes from de-binary-ing a .bed file)
|
// then it's the raw header (comes from de-binary-ing a .bed file)
|
||||||
if ( !header.startsWith("#") )
|
if ( !header.startsWith("#") )
|
||||||
|
|
@ -216,38 +220,18 @@ public class PlinkRod extends BasicReferenceOrderedDatum implements Iterator<Pli
|
||||||
|
|
||||||
plinkFileType = PlinkFileType.STANDARD_PED;
|
plinkFileType = PlinkFileType.STANDARD_PED;
|
||||||
|
|
||||||
ArrayList<PlinkVariantInfo> seqVars = new ArrayList<PlinkVariantInfo>();
|
|
||||||
String[] headerFields = header.split("\t");
|
String[] headerFields = header.split("\t");
|
||||||
|
|
||||||
|
int skippedFields = 0;
|
||||||
for ( String field : headerFields ) {
|
for ( String field : headerFields ) {
|
||||||
if ( ! headerEntries.contains(field) ) {
|
if ( headerEntries.contains(field) )
|
||||||
|
skippedFields++;
|
||||||
|
else
|
||||||
// not a standard header, so a variant
|
// not a standard header, so a variant
|
||||||
seqVars.add(new PlinkVariantInfo(field));
|
seqVars.add(new PlinkVariantInfo(field));
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
|
||||||
return seqVars;
|
return skippedFields;
|
||||||
}
|
|
||||||
|
|
||||||
private ArrayList<Integer> getSNPOffsetsFromHeader(String header) {
|
|
||||||
ArrayList<Integer> offsets = new ArrayList<Integer>();
|
|
||||||
String[] headerFields;
|
|
||||||
if ( plinkFileType == PlinkFileType.STANDARD_PED ) {
|
|
||||||
headerFields = header.split("\t+");
|
|
||||||
} else {
|
|
||||||
headerFields = header.split("\\s+");
|
|
||||||
}
|
|
||||||
|
|
||||||
int offset = 0;
|
|
||||||
|
|
||||||
for ( String field : headerFields ) {
|
|
||||||
if ( ! headerEntries.contains(field) ) {
|
|
||||||
offsets.add(offset);
|
|
||||||
}
|
|
||||||
offset++;
|
|
||||||
}
|
|
||||||
|
|
||||||
return offsets;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/* *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***
|
/* *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***
|
||||||
|
|
@ -396,7 +380,7 @@ public class PlinkRod extends BasicReferenceOrderedDatum implements Iterator<Pli
|
||||||
// four genotypes encoded in this byte
|
// four genotypes encoded in this byte
|
||||||
int[] genotypes = parseGenotypes(genotype);
|
int[] genotypes = parseGenotypes(genotype);
|
||||||
for ( int g : genotypes ) {
|
for ( int g : genotypes ) {
|
||||||
variants.get(snpOffset).addBinaryGenotypeEntry(g,sampleNames.get(sampleOffset));
|
variants.get(snpOffset).addBinaryGenotypeEntry(g);
|
||||||
|
|
||||||
if ( major ) {
|
if ( major ) {
|
||||||
sampleOffset++;
|
sampleOffset++;
|
||||||
|
|
@ -420,13 +404,17 @@ public class PlinkRod extends BasicReferenceOrderedDatum implements Iterator<Pli
|
||||||
genotypes[3] = ( ( genotype & 192 ) >>> 6 );
|
genotypes[3] = ( ( genotype & 192 ) >>> 6 );
|
||||||
return genotypes;
|
return genotypes;
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
|
||||||
class PlinkVariantInfo implements Comparable {
|
class PlinkVariantInfo implements Comparable {
|
||||||
|
|
||||||
private String variantName;
|
private String variantName;
|
||||||
private GenomeLoc loc;
|
private GenomeLoc loc;
|
||||||
private Map<String, List<byte[]>> genotypes = new HashMap<String, List<byte[]>>();
|
|
||||||
|
// the list of genotypes in the same order as in sampleNames (using a map here is inefficient)
|
||||||
|
private List<Allele[]> genotypes = new ArrayList<Allele[]>();
|
||||||
|
|
||||||
|
// map of Alleles seen (so that we can share Allele objects among samples)
|
||||||
|
HashMap<String, Allele> alleles = new HashMap<String, Allele>(4);
|
||||||
|
|
||||||
// for indels
|
// for indels
|
||||||
private boolean isIndel = false;
|
private boolean isIndel = false;
|
||||||
|
|
@ -451,8 +439,12 @@ class PlinkVariantInfo implements Comparable {
|
||||||
return variantName;
|
return variantName;
|
||||||
}
|
}
|
||||||
|
|
||||||
public Map<String, List<byte[]>> getGenotypes() {
|
public Map<String, Allele[]> getGenotypes() {
|
||||||
return genotypes;
|
Map<String, Allele[]> genotypeMap = new HashMap<String, Allele[]>();
|
||||||
|
int index = 0;
|
||||||
|
for ( Allele[] myAlleles : genotypes )
|
||||||
|
genotypeMap.put(sampleNames.get(index++), myAlleles);
|
||||||
|
return genotypeMap;
|
||||||
}
|
}
|
||||||
|
|
||||||
public boolean isIndel() {
|
public boolean isIndel() {
|
||||||
|
|
@ -510,21 +502,36 @@ class PlinkVariantInfo implements Comparable {
|
||||||
locAllele2 = al2;
|
locAllele2 = al2;
|
||||||
}
|
}
|
||||||
|
|
||||||
public void addGenotypeEntry(String[] alleleStrings, String sampleName) {
|
public void addGenotypeEntry(String[] alleleStrings) {
|
||||||
|
|
||||||
ArrayList<byte[]> alleles = new ArrayList<byte[]>(2);
|
Allele[] myAlleles = new Allele[2];
|
||||||
|
|
||||||
for ( String alleleString : alleleStrings ) {
|
for (int i = 0; i < 2; i++) {
|
||||||
if ( alleleString.equals(PlinkRod.SEQUENOM_NO_CALL) )
|
if ( alleleStrings.length <= i ) {
|
||||||
alleles.add(Allele.NO_CALL_STRING.getBytes());
|
myAlleles[i] = null;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
String alleleString = alleleStrings[i];
|
||||||
|
|
||||||
|
Allele allele;
|
||||||
|
if ( alleles.containsKey(alleleString) ) {
|
||||||
|
allele = alleles.get(alleleString);
|
||||||
|
} else {
|
||||||
|
if ( PlinkRod.SEQUENOM_NO_CALL.equals(alleleString) )
|
||||||
|
allele = Allele.NO_CALL;
|
||||||
else
|
else
|
||||||
alleles.add(alleleString.getBytes());
|
allele = new Allele(alleleString);
|
||||||
|
alleles.put(alleleString, allele);
|
||||||
}
|
}
|
||||||
|
|
||||||
genotypes.put(sampleName, alleles);
|
myAlleles[i] = allele;
|
||||||
}
|
}
|
||||||
|
|
||||||
public void addBinaryGenotypeEntry( int genoTYPE, String sampleName ) {
|
genotypes.add(myAlleles);
|
||||||
|
}
|
||||||
|
|
||||||
|
public void addBinaryGenotypeEntry(int genoTYPE) {
|
||||||
String[] alleleStr = new String[2];
|
String[] alleleStr = new String[2];
|
||||||
if ( genoTYPE == 0 ) {
|
if ( genoTYPE == 0 ) {
|
||||||
alleleStr[0] = locAllele1;
|
alleleStr[0] = locAllele1;
|
||||||
|
|
@ -540,7 +547,7 @@ class PlinkVariantInfo implements Comparable {
|
||||||
alleleStr[1] = "0";
|
alleleStr[1] = "0";
|
||||||
}
|
}
|
||||||
|
|
||||||
addGenotypeEntry(alleleStr, sampleName);
|
addGenotypeEntry(alleleStr);
|
||||||
}
|
}
|
||||||
|
|
||||||
public int compareTo(Object obj) {
|
public int compareTo(Object obj) {
|
||||||
|
|
@ -550,6 +557,7 @@ class PlinkVariantInfo implements Comparable {
|
||||||
|
|
||||||
return loc.compareTo(((PlinkVariantInfo) obj).getLocation());
|
return loc.compareTo(((PlinkVariantInfo) obj).getLocation());
|
||||||
}
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
class PlinkBinaryTrifecta {
|
class PlinkBinaryTrifecta {
|
||||||
|
|
|
||||||
|
|
@ -391,23 +391,26 @@ public class VariantContextAdaptors {
|
||||||
|
|
||||||
Set<Genotype> genotypes = new HashSet<Genotype>();
|
Set<Genotype> genotypes = new HashSet<Genotype>();
|
||||||
|
|
||||||
Map<String, List<byte[]>> genotypeSets = plink.getGenotypes();
|
Map<String, Allele[]> genotypeSets = plink.getGenotypes();
|
||||||
// for each sample
|
|
||||||
for ( Map.Entry<String, List<byte[]>> genotype : genotypeSets.entrySet() ) {
|
// We need to iterate through this list and recreate the Alleles since the
|
||||||
|
// PlinkRod does not promise to annotate any of the Alleles as reference
|
||||||
|
// for each sample...
|
||||||
|
for ( Map.Entry<String, Allele[]> genotype : genotypeSets.entrySet() ) {
|
||||||
ArrayList<Allele> myAlleles = new ArrayList<Allele>(2);
|
ArrayList<Allele> myAlleles = new ArrayList<Allele>(2);
|
||||||
|
|
||||||
// for each allele
|
// for each allele...
|
||||||
for ( byte[] alleleString : genotype.getValue() ) {
|
for ( Allele myAllele : genotype.getValue() ) {
|
||||||
Allele allele;
|
Allele allele;
|
||||||
if ( Allele.wouldBeNoCallAllele(alleleString) ) {
|
if ( myAllele.isNoCall() ) {
|
||||||
allele = Allele.NO_CALL;
|
allele = Allele.NO_CALL;
|
||||||
} else {
|
} else {
|
||||||
if ( !plink.isIndel() ) {
|
if ( !plink.isIndel() ) {
|
||||||
allele = new Allele(alleleString, refAllele.basesMatch(alleleString));
|
allele = new Allele(myAllele.getBases(), refAllele.equals(myAllele, true));
|
||||||
} else if ( Allele.wouldBeNullAllele(alleleString) ) {
|
} else if ( myAllele.isNull() ) {
|
||||||
allele = new Allele(alleleString, plink.isInsertion());
|
allele = new Allele(Allele.NULL_ALLELE_STRING, plink.isInsertion());
|
||||||
} else {
|
} else {
|
||||||
allele = new Allele(alleleString, !plink.isInsertion());
|
allele = new Allele(myAllele.getBases(), !plink.isInsertion());
|
||||||
}
|
}
|
||||||
|
|
||||||
alleles.add(allele);
|
alleles.add(allele);
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue