A first draft of scripts and LaTeX templates required to automatically generate slides for the 1,000 Genomes Automated Data Processing Report.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3652 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
bf5cbad04c
commit
b2127e59c4
|
|
@ -0,0 +1,16 @@
|
||||||
|
#!/usr/bin/perl -w
|
||||||
|
|
||||||
|
use strict;
|
||||||
|
use lib "$ENV{'STING_DIR'}/perl";
|
||||||
|
use StingArgs;
|
||||||
|
|
||||||
|
my %args = &getCommandArguments("DIR" => undef, "OUT" => undef);
|
||||||
|
|
||||||
|
my $outfile = $args{'OUT'};
|
||||||
|
chomp(my @files = qx(find $args{'DIR'} -path \*slide\*.pdf | sort -n));
|
||||||
|
|
||||||
|
print join("\n", @files) . "\n";
|
||||||
|
|
||||||
|
my $cmd = "gs -q -sPAPERSIZE=letter -dNOPAUSE -dBATCH -sDEVICE=pdfwrite -dAutoRotatePages=/All -sOutputFile=$outfile " . join(' ', @files);
|
||||||
|
|
||||||
|
system($cmd);
|
||||||
|
|
@ -0,0 +1,64 @@
|
||||||
|
#!/usr/bin/perl -w
|
||||||
|
|
||||||
|
use strict;
|
||||||
|
use lib "$ENV{'STING_DIR'}/perl";
|
||||||
|
use StingArgs;
|
||||||
|
use DistributedMake;
|
||||||
|
use DataTable;
|
||||||
|
use Data::Dumper;
|
||||||
|
|
||||||
|
my %args = &getCommandArguments(
|
||||||
|
"OUT_DIR" => undef,
|
||||||
|
"SEQUENCE_INDEX" => "/broad/1KG/DCC/ftp/sequence.index",
|
||||||
|
"ALIGNMENT_INDEX" => "/broad/1KG/DCC/ftp/alignment.index"
|
||||||
|
);
|
||||||
|
|
||||||
|
my @sequenceIndex = &readTable($args{'SEQUENCE_INDEX'}, "header" => 1, "delimiter" => "\t");
|
||||||
|
my @alignmentIndex = &readTable($args{'ALIGNMENT_INDEX'}, "header" => 1, "delimiter" => "\t");
|
||||||
|
|
||||||
|
my %results;
|
||||||
|
|
||||||
|
foreach my $sequenceEntry (@sequenceIndex) {
|
||||||
|
my %sequenceEntry = %{$sequenceEntry};
|
||||||
|
|
||||||
|
if ($sequenceEntry{'STUDY_NAME'} =~ /1000/ && $sequenceEntry{'ANALYSIS_GROUP'} =~ /low coverage/ && $sequenceEntry{'FASTQ_FILE'} !~ /_\d.filt.fastq.gz/) {
|
||||||
|
${${$results{'all'}}{'sequencer'}}{$sequenceEntry{'INSTRUMENT_PLATFORM'}}++;
|
||||||
|
${${$results{$sequenceEntry{'STUDY_ID'}}}{'sequencer'}}{$sequenceEntry{'INSTRUMENT_PLATFORM'}}++;
|
||||||
|
|
||||||
|
${$results{'all'}}{'population'} = "all";
|
||||||
|
${$results{$sequenceEntry{'STUDY_ID'}}}{'population'} = $sequenceEntry{'POPULATION'};
|
||||||
|
|
||||||
|
${$results{'all'}}{'used_lanes'} += ($sequenceEntry{'WITHDRAWN'} == 0);
|
||||||
|
${$results{$sequenceEntry{'STUDY_ID'}}}{'used_lanes'} += ($sequenceEntry{'WITHDRAWN'} == 0);
|
||||||
|
|
||||||
|
${$results{'all'}}{'unused_lanes'} += ($sequenceEntry{'WITHDRAWN'} != 0);
|
||||||
|
${$results{$sequenceEntry{'STUDY_ID'}}}{'unused_lanes'} += ($sequenceEntry{'WITHDRAWN'} != 0);
|
||||||
|
|
||||||
|
if ($sequenceEntry{'WITHDRAWN'} == 0) {
|
||||||
|
${${${$results{'all'}}{'lanes_per_samples'}}}{$sequenceEntry{'SAMPLE_NAME'}}++;
|
||||||
|
${${${$results{$sequenceEntry{'STUDY_ID'}}}{'lanes_per_samples'}}}{$sequenceEntry{'SAMPLE_NAME'}}++;
|
||||||
|
} else {
|
||||||
|
${${$results{'all'}}{'withdrawn_comment'}}{$sequenceEntry{'COMMENT'}}++;
|
||||||
|
${${$results{$sequenceEntry{'STUDY_ID'}}}{'withdrawn_comment'}}{$sequenceEntry{'COMMENT'}}++;
|
||||||
|
}
|
||||||
|
|
||||||
|
${${$results{'all'}}{'lane_parities'}}{$sequenceEntry{'LIBRARY_LAYOUT'}}++;
|
||||||
|
${${$results{$sequenceEntry{'STUDY_ID'}}}{'lane_parities'}}{$sequenceEntry{'LIBRARY_LAYOUT'}}++;
|
||||||
|
|
||||||
|
${${$results{'all'}}{'submission_date'}}{$sequenceEntry{'SUBMISSION_DATE'}}++;
|
||||||
|
${${$results{$sequenceEntry{'STUDY_ID'}}}{'submission_date'}}{$sequenceEntry{'SUBMISSION_DATE'}}++;
|
||||||
|
|
||||||
|
${${$results{'all'}}{'center_names'}}{$sequenceEntry{'CENTER_NAME'}}++;
|
||||||
|
${${$results{$sequenceEntry{'STUDY_ID'}}}{'center_names'}}{$sequenceEntry{'CENTER_NAME'}}++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
foreach my $studyid (keys(%results)) {
|
||||||
|
my %study = %{${${$results{$studyid}}{'lanes_per_samples'}}};
|
||||||
|
my @samples = keys(%study);
|
||||||
|
my $numSamples = scalar(@samples);
|
||||||
|
|
||||||
|
${$results{$studyid}}{'num_samples'} = $numSamples;
|
||||||
|
}
|
||||||
|
|
||||||
|
print Dumper(\%results);
|
||||||
|
|
@ -0,0 +1,94 @@
|
||||||
|
#!/usr/bin/perl -w
|
||||||
|
|
||||||
|
use strict;
|
||||||
|
use lib "$ENV{'STING_DIR'}/perl";
|
||||||
|
use StingArgs;
|
||||||
|
use DistributedMake;
|
||||||
|
use DataTable;
|
||||||
|
use Data::Dumper;
|
||||||
|
use File::Basename;
|
||||||
|
|
||||||
|
my %args = &getCommandArguments(
|
||||||
|
"LIST" => undef,
|
||||||
|
"OUT_DIR" => undef,
|
||||||
|
|
||||||
|
"REFERENCE" => "/broad/1KG/reference/human_b36_both.fasta",
|
||||||
|
"DBSNP" => "/humgen/gsa-scr1/GATK_Data/dbsnp_129_b36.rod",
|
||||||
|
|
||||||
|
"SEQUENCE_INDEX" => "/broad/1KG/DCC/ftp/sequence.index",
|
||||||
|
"ALIGNMENT_INDEX" => "/broad/1KG/DCC/ftp/alignment.index",
|
||||||
|
|
||||||
|
"DRY_RUN" => 1,
|
||||||
|
"QUEUE" => "gsa",
|
||||||
|
"NUM_JOBS" => "",
|
||||||
|
"KEEP_GOING" => 1,
|
||||||
|
);
|
||||||
|
|
||||||
|
my $dm = new DistributedMake("dryRun" => $args{'DRY_RUN'}, "queue" => $args{'QUEUE'}, "numJobs" => "", "keepGoing" => 1, "outputFile" => "$args{'OUT_DIR'}/GenerateDataProcessingReport2.log");
|
||||||
|
|
||||||
|
my @entries;
|
||||||
|
|
||||||
|
open(LIST, $args{'LIST'});
|
||||||
|
while (my $entry = <LIST>) {
|
||||||
|
chomp($entry);
|
||||||
|
|
||||||
|
my %entry;
|
||||||
|
($entry{'POP'}, $entry{'UG'}, $entry{'QCALL'}) = split(/\s+/, $entry);
|
||||||
|
|
||||||
|
push(@entries, \%entry);
|
||||||
|
}
|
||||||
|
close(LIST);
|
||||||
|
|
||||||
|
my @slides;
|
||||||
|
|
||||||
|
foreach my $entry (@entries) {
|
||||||
|
my %entry = %{$entry};
|
||||||
|
|
||||||
|
# Combine VCFs
|
||||||
|
my $vcfCombineOut = "$args{'OUT_DIR'}/intermediate/$entry{'POP'}.combined.vcf";
|
||||||
|
my $vcfCombineOutCmd = "java -Xmx2048m -jar $ENV{'STING_DIR'}/dist/GenomeAnalysisTK.jar -T VCFCombine -R $args{'REFERENCE'} -type UNION -B UG,VCF,$entry{'UG'} -B QCALL,VCF,$entry{'QCALL'} -A -priority 'UG,QCALL' -o $vcfCombineOut -l INFO -L 1";
|
||||||
|
$dm->addRule($vcfCombineOut, [$entry{'UG'}, $entry{'QCALL'}], $vcfCombineOutCmd);
|
||||||
|
|
||||||
|
# Evaluate variants
|
||||||
|
my $evalRoot = "$vcfCombineOut.eval/eval";
|
||||||
|
my $evalCounts = "$evalRoot.Count_Variants.csv";
|
||||||
|
my $evalCmd = "java -Xmx2048m -jar $ENV{'STING_DIR'}/dist/GenomeAnalysisTK.jar -T VariantEval -R $args{'REFERENCE'} -D $args{'DBSNP'} -B eval,VCF,$vcfCombineOut -select 'set==\"Intersection\"' -selectName Intersection -select 'set==\"UG-filteredInOther\"' -selectName InUG-FilteredInQCALL -select 'set==\"QCALL-filteredInOther\"' -selectName InQCALL-FilteredInUG -select 'set==\"UG\"' -selectName UG -select 'set==\"QCALL\"' -selectName QCALL -reportType R -reportLocation $evalRoot -l INFO -L 1";
|
||||||
|
$dm->addRule($evalCounts, $vcfCombineOut, $evalCmd);
|
||||||
|
|
||||||
|
# Get SNPs per sample
|
||||||
|
my $snpsPerSample = "$evalRoot.persample.table";
|
||||||
|
my $snpsPerSampleCmd = "$ENV{'STING_DIR'}/perl/reports/1KG/GetSNPsPerSample.pl VCF=$vcfCombineOut OUT=$snpsPerSample";
|
||||||
|
$dm->addRule($snpsPerSample, $vcfCombineOut, $snpsPerSampleCmd);
|
||||||
|
|
||||||
|
# Create topsheet slide
|
||||||
|
my $slideDir = "$args{'OUT_DIR'}/intermediate/$entry{'POP'}/slides";
|
||||||
|
my $topSheetRoot = "$slideDir/$entry{'POP'}.000.topsheet";
|
||||||
|
my $topSheetSlide = "$topSheetRoot.pdf";
|
||||||
|
my $topSheetSlideCmd = "$ENV{'STING_DIR'}/perl/reports/1KG/MakeTopSheetSlide.pl VCF=$vcfCombineOut SEQUENCE_INDEX=$args{'SEQUENCE_INDEX'} ALIGNMENT_INDEX=$args{'ALIGNMENT_INDEX'} EVAL_ROOT=$evalRoot OUT=$topSheetRoot";
|
||||||
|
#$dm->addRule($topSheetSlide, [$vcfCombineOut, $args{'SEQUENCE_INDEX'}, $args{'ALIGNMENT_INDEX'}, $evalCounts], $topSheetSlideCmd);
|
||||||
|
|
||||||
|
# Create Venn diagram slide of variants
|
||||||
|
my $vennRoot = "$slideDir/$entry{'POP'}.010.venn";
|
||||||
|
my $vennPdf = "$vennRoot.pdf";
|
||||||
|
my $vennPdfCmd = "$ENV{'STING_DIR'}/perl/reports/1KG/MakeVennSlide.pl EVAL_ROOT=$evalRoot OUT_ROOT=$vennRoot POP=$entry{'POP'}";
|
||||||
|
$dm->addRule($vennPdf, $evalCounts, $vennPdfCmd);
|
||||||
|
|
||||||
|
# Create plots for other slides
|
||||||
|
my $plotRoot = "$evalRoot.plot";
|
||||||
|
my $plotDummy = "$plotRoot.venn_per_ac.norm.pdf";
|
||||||
|
my $plotCmd = "Rscript $ENV{'STING_DIR'}/perl/reports/1KG/MakeDataProcessingReportPlots.R $evalRoot $plotRoot $entry{'POP'}";
|
||||||
|
$dm->addRule($plotDummy, [$evalCounts, $snpsPerSample], $plotCmd, 'queue' => "");
|
||||||
|
|
||||||
|
# Create other slides
|
||||||
|
my $dprSlideDummy = "$slideDir/$entry{'POP'}.050.eval.plot.titv.pdf";
|
||||||
|
my $dprSlidesCmd = "$ENV{'STING_DIR'}/perl/reports/1KG/MakeDPRSlides.pl PLOT_ROOT=$plotRoot OUT_DIR=$slideDir POP=$entry{'POP'}";
|
||||||
|
$dm->addRule($dprSlideDummy, $plotDummy, $dprSlidesCmd);
|
||||||
|
|
||||||
|
push(@slides, $dprSlideDummy);
|
||||||
|
}
|
||||||
|
|
||||||
|
my $assembledSlides = "$args{'OUT_DIR'}/dpr.pdf";
|
||||||
|
my $assembledSlidesCmd = "$ENV{'STING_DIR'}/perl/reports/1KG/AssembleSlides.pl DIR=$args{'OUT_DIR'} OUT=$assembledSlides";
|
||||||
|
$dm->addRule($assembledSlides, \@slides, $assembledSlidesCmd);
|
||||||
|
|
||||||
|
$dm->execute();
|
||||||
|
|
@ -0,0 +1,67 @@
|
||||||
|
#!/usr/bin/perl -w
|
||||||
|
|
||||||
|
use strict;
|
||||||
|
use lib "$ENV{'STING_DIR'}/perl";
|
||||||
|
use StingArgs;
|
||||||
|
|
||||||
|
my %args = &getCommandArguments("VCF" => undef, "OUT" => "/dev/stdout");
|
||||||
|
|
||||||
|
my %table;
|
||||||
|
my %sets;
|
||||||
|
|
||||||
|
open(VCF, $args{'VCF'});
|
||||||
|
|
||||||
|
my @header;
|
||||||
|
while (my $line = <VCF>) {
|
||||||
|
chomp($line);
|
||||||
|
if ($line =~ /#/) {
|
||||||
|
if ($line =~ /CHROM/) {
|
||||||
|
$line =~ s/#//g;
|
||||||
|
@header = split(/\s+/, $line);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
my @columns = split(/\s+/, $line);
|
||||||
|
|
||||||
|
if ($columns[0] eq '1') {
|
||||||
|
my %entry;
|
||||||
|
for (my $i = 0; $i <= $#columns; $i++) {
|
||||||
|
$entry{$header[$i]} = $columns[$i];
|
||||||
|
}
|
||||||
|
|
||||||
|
my ($set) = $entry{'INFO'} =~ /set=(\w+)/;
|
||||||
|
$sets{$set} = 1;
|
||||||
|
|
||||||
|
for (my $i = 9; $i <= $#header; $i++) {
|
||||||
|
if ($entry{$header[$i]} =~ /0[\\\|\/]1/ ||
|
||||||
|
$entry{$header[$i]} =~ /1[\\\|\/]0/ ||
|
||||||
|
$entry{$header[$i]} =~ /1[\\\|\/]1/) {
|
||||||
|
|
||||||
|
${$table{$header[$i]}}{$set}++;
|
||||||
|
$sets{$set} = 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
close(VCF);
|
||||||
|
|
||||||
|
open(OUT, ">$args{'OUT'}");
|
||||||
|
|
||||||
|
print OUT "sample\t" . join("\t", sort { $a cmp $b } keys(%sets)) . "\n";
|
||||||
|
|
||||||
|
foreach my $sample (sort { $a cmp $b } keys(%table)) {
|
||||||
|
print OUT "$sample";
|
||||||
|
|
||||||
|
foreach my $set (sort { $a cmp $b } keys(%sets)) {
|
||||||
|
my $value = 0;
|
||||||
|
if (exists(${$table{$sample}}{$set})) {
|
||||||
|
$value = ${$table{$sample}}{$set};
|
||||||
|
}
|
||||||
|
|
||||||
|
print OUT "\t$value";
|
||||||
|
}
|
||||||
|
|
||||||
|
print OUT "\n";
|
||||||
|
}
|
||||||
|
|
||||||
|
close(OUT);
|
||||||
|
|
@ -0,0 +1,50 @@
|
||||||
|
#!/usr/bin/perl -w
|
||||||
|
|
||||||
|
use strict;
|
||||||
|
use lib "$ENV{'STING_DIR'}/perl";
|
||||||
|
use StingArgs;
|
||||||
|
use File::Basename;
|
||||||
|
use DistributedMake;
|
||||||
|
|
||||||
|
my %args = &getCommandArguments("PLOT_ROOT" => undef, "OUT_DIR" => undef, "POP" => undef);
|
||||||
|
|
||||||
|
my $dm = new DistributedMake("dryRun" => 0, "numJobs" => "1");
|
||||||
|
|
||||||
|
my $timestamp = gmtime();
|
||||||
|
|
||||||
|
my @items = (
|
||||||
|
{ 'plotprefix' => "$args{'PLOT_ROOT'}.venn_per_ac", 'title' => 'Callset concordance per allele count', 'population' => $args{'POP'}, 'caption' => "Concordance of the UnifiedGenotyper and QCALL callsets, per allele count. Bottom and top sections of each bar represent the calls unique to each caller's callset, while the middle portion represents the intersection.", 'timestamp' => $timestamp },
|
||||||
|
{ 'plotprefix' => "$args{'PLOT_ROOT'}.venn_per_ac.norm", 'title' => 'Normalized callset concordance per allele count', 'population' => $args{'POP'}, 'caption' => "Concordance of the UnifiedGenotyper and QCALL callsets, per allele count, normalized by the number of variants in the callset union for that allele count bin. Bottom and top sections of each bar represent the calls unique to each caller's callset, while the middle portion represents the intersection.", 'timestamp' => $timestamp },
|
||||||
|
{ 'plotprefix' => "$args{'PLOT_ROOT'}.acs", 'title' => 'Allele count spectrum', 'population' => $args{'POP'}, 'caption' => 'Allele count spectrum for all known, novel, and all variants. Individual contributions from separate callers (thin, dotted or dashed lines) as well as the callset intersection (bold, solid lines) are shown.', 'timestamp' => $timestamp },
|
||||||
|
{ 'plotprefix' => "$args{'PLOT_ROOT'}.titv", 'title' => 'Ti/Tv per allele count', 'population' => $args{'POP'}, 'caption' => 'Ratio of transition mutations to transversions, per allele count. Whole-genome expectation is approximately 2.1. Fluctuation at higher allele counts is typically due to fewer available variants for the calculation at those frequencies.', 'timestamp' => $timestamp },
|
||||||
|
{ 'plotprefix' => "$args{'PLOT_ROOT'}.counts_per_sample", 'title' => 'SNP calls per sample', 'population' => $args{'POP'}, 'caption' => 'Number of SNPs per sample. As samples in plot are all from the same or similar populations, with presumably similar heterozygosity, all samples should have roughly consistent performance. Outliers may indicate problematic samples - perhaps in coverage, identity QC (sample mixup), or quality of sequencing.', 'timestamp' => $timestamp },
|
||||||
|
);
|
||||||
|
|
||||||
|
my $idnum = 1;
|
||||||
|
|
||||||
|
foreach my $item (@items) {
|
||||||
|
$idnum++;
|
||||||
|
my $id = "$args{'POP'}." . sprintf("%03d", $idnum*10);
|
||||||
|
|
||||||
|
my %item = %$item;
|
||||||
|
|
||||||
|
my $basename = "$id." . basename($item{'plotprefix'});
|
||||||
|
my $table = "$args{'OUT_DIR'}/$basename.kvp";
|
||||||
|
|
||||||
|
open(TABLE, ">$table");
|
||||||
|
foreach my $key (sort { $a cmp $b } keys(%item)) {
|
||||||
|
print TABLE "$key\t$item{$key}\n";
|
||||||
|
}
|
||||||
|
close(TABLE);
|
||||||
|
|
||||||
|
my $render = "$args{'OUT_DIR'}/$basename.tex";
|
||||||
|
my $renderCmd = "$ENV{'STING_DIR'}/perl/reports/1KG/RenderTemplate.pl KVP=$table TEMPLATE=$ENV{'STING_DIR'}/perl/reports/1KG/plot_and_text_template.tex TEMPLATE_OUT=$render";
|
||||||
|
$dm->addRule($render, "$item{'plotprefix'}.pdf", $renderCmd);
|
||||||
|
|
||||||
|
my $pdf = "$args{'OUT_DIR'}/$basename.pdf";
|
||||||
|
my $pdfdir = dirname($pdf);
|
||||||
|
my $pdfCmd = "pdflatex -output-directory $pdfdir $render";
|
||||||
|
$dm->addRule($pdf, $render, $pdfCmd);
|
||||||
|
}
|
||||||
|
|
||||||
|
$dm->execute();
|
||||||
|
|
@ -0,0 +1,146 @@
|
||||||
|
args = commandArgs(TRUE);
|
||||||
|
|
||||||
|
evalRoot = args[1];
|
||||||
|
if (is.na(evalRoot)) { evalRoot = "results/v2/intermediate/CEU.combined.vcf.eval/eval"; }
|
||||||
|
|
||||||
|
plotRoot = args[2];
|
||||||
|
if (is.na(plotRoot)) { plotRoot = "results/v2/intermediate/CEU.combined.vcf.eval/eval.plot"; }
|
||||||
|
|
||||||
|
population = args[3];
|
||||||
|
if (is.na(population)) { population = "unknown population"; }
|
||||||
|
|
||||||
|
fileAlleleCountStats = paste(evalRoot, ".AlleleCountStats.csv", sep="");
|
||||||
|
fileCompOverlap = paste(evalRoot, ".Comp_Overlap.csv", sep="");
|
||||||
|
fileCountVariants = paste(evalRoot, ".Count_Variants.csv", sep="");
|
||||||
|
fileGenotypeConcordance = paste(evalRoot, ".Genotype_Concordance.csv", sep="");
|
||||||
|
fileMetricsByAc = paste(evalRoot, ".MetricsByAc.csv", sep="");
|
||||||
|
fileQuality_Metrics_by_allele_count = paste(evalRoot, ".Quality_Metrics_by_allele_count.csv", sep="");
|
||||||
|
fileQualityScoreHistogram = paste(evalRoot, ".QualityScoreHistogram.csv", sep="");
|
||||||
|
fileSampleStatistics = paste(evalRoot, ".Sample_Statistics.csv", sep="");
|
||||||
|
fileSampleSummaryStatistics = paste(evalRoot, ".Sample_Summary_Statistics.csv", sep="");
|
||||||
|
fileTi_slash_Tv_Variant_Evaluator = paste(evalRoot, ".Ti_slash_Tv_Variant_Evaluator.csv", sep="");
|
||||||
|
fileTiTvStats = paste(evalRoot, ".TiTvStats.csv", sep="");
|
||||||
|
fileVariant_Quality_Score = paste(evalRoot, ".Variant_Quality_Score.csv", sep="");
|
||||||
|
filePerSample = paste(evalRoot, ".persample.table", sep="");
|
||||||
|
|
||||||
|
# Allele count spectrum
|
||||||
|
if (!is.na(plotRoot)) {
|
||||||
|
pdf(paste(plotRoot, ".acs.pdf", sep=""));
|
||||||
|
} else {
|
||||||
|
x11();
|
||||||
|
}
|
||||||
|
|
||||||
|
dataMetricsByAc = read.csv(fileMetricsByAc, header=TRUE, comment.char="#");
|
||||||
|
|
||||||
|
dataMetricsByAc.none.all = dataMetricsByAc[which(dataMetricsByAc$jexl_expression == "none" & dataMetricsByAc$filter_name == "called" & dataMetricsByAc$novelty_name == "all"),];
|
||||||
|
|
||||||
|
dataMetricsByAc.int.all = dataMetricsByAc[which(dataMetricsByAc$jexl_expression == "Intersection" & dataMetricsByAc$filter_name == "called" & dataMetricsByAc$novelty_name == "all"),];
|
||||||
|
dataMetricsByAc.int.known = dataMetricsByAc[which(dataMetricsByAc$jexl_expression == "Intersection" & dataMetricsByAc$filter_name == "called" & dataMetricsByAc$novelty_name == "known"),];
|
||||||
|
dataMetricsByAc.int.novel = dataMetricsByAc[which(dataMetricsByAc$jexl_expression == "Intersection" & dataMetricsByAc$filter_name == "called" & dataMetricsByAc$novelty_name == "novel"),];
|
||||||
|
|
||||||
|
dataMetricsByAc.UG.all = dataMetricsByAc[which(dataMetricsByAc$jexl_expression == "UG" & dataMetricsByAc$filter_name == "called" & dataMetricsByAc$novelty_name == "all"),] + dataMetricsByAc.int.all;
|
||||||
|
dataMetricsByAc.UG.known = dataMetricsByAc[which(dataMetricsByAc$jexl_expression == "UG" & dataMetricsByAc$filter_name == "called" & dataMetricsByAc$novelty_name == "known"),] + dataMetricsByAc.int.known;
|
||||||
|
dataMetricsByAc.UG.novel = dataMetricsByAc[which(dataMetricsByAc$jexl_expression == "UG" & dataMetricsByAc$filter_name == "called" & dataMetricsByAc$novelty_name == "novel"),] + dataMetricsByAc.int.novel;
|
||||||
|
|
||||||
|
dataMetricsByAc.QCALL.all = dataMetricsByAc[which(dataMetricsByAc$jexl_expression == "QCALL" & dataMetricsByAc$filter_name == "called" & dataMetricsByAc$novelty_name == "all"),] + dataMetricsByAc.int.all;
|
||||||
|
dataMetricsByAc.QCALL.known = dataMetricsByAc[which(dataMetricsByAc$jexl_expression == "QCALL" & dataMetricsByAc$filter_name == "called" & dataMetricsByAc$novelty_name == "known"),] + dataMetricsByAc.int.known;
|
||||||
|
dataMetricsByAc.QCALL.novel = dataMetricsByAc[which(dataMetricsByAc$jexl_expression == "QCALL" & dataMetricsByAc$filter_name == "called" & dataMetricsByAc$novelty_name == "novel"),] + dataMetricsByAc.int.novel;
|
||||||
|
|
||||||
|
|
||||||
|
plot(0, 0, type="n", xlab="Allele count", ylab="Number of variants", xlim=c(0, max(dataMetricsByAc.none.all$AC)), ylim=c(min(dataMetricsByAc.none.all$n), max(dataMetricsByAc.none.all$n)), main=paste("Allele count spectrum (", population, ")", sep=""), cex=1.2, cex.axis=1.2, cex.lab=1.2, bty="n");
|
||||||
|
|
||||||
|
points(dataMetricsByAc.int.all$AC, dataMetricsByAc.int.all$n, type="l", lwd=3, col="black");
|
||||||
|
points(dataMetricsByAc.int.known$AC, dataMetricsByAc.int.known$n, type="l", lwd=3, col="blue");
|
||||||
|
points(dataMetricsByAc.int.novel$AC, dataMetricsByAc.int.novel$n, type="l", lwd=3, col="red");
|
||||||
|
|
||||||
|
points(dataMetricsByAc.UG.all$AC, dataMetricsByAc.UG.all$n, type="l", lwd=1, lty=2, col="black");
|
||||||
|
points(dataMetricsByAc.UG.known$AC, dataMetricsByAc.UG.known$n, type="l", lwd=1, lty=2, col="blue");
|
||||||
|
points(dataMetricsByAc.UG.novel$AC, dataMetricsByAc.UG.novel$n, type="l", lwd=1, lty=2, col="red");
|
||||||
|
|
||||||
|
points(dataMetricsByAc.QCALL.all$AC, dataMetricsByAc.QCALL.all$n, type="l", lwd=1, lty=3, col="black");
|
||||||
|
points(dataMetricsByAc.QCALL.known$AC, dataMetricsByAc.QCALL.known$n, type="l", lwd=1, lty=3, col="blue");
|
||||||
|
points(dataMetricsByAc.QCALL.novel$AC, dataMetricsByAc.QCALL.novel$n, type="l", lwd=1, lty=3, col="red");
|
||||||
|
|
||||||
|
legend("topright", c("Intersection: all", "Intersection: known (present in dbSNP 129)", "Intersection: novel (absent in dbSNP 129)", "UnifiedGenotyper: all", "UnifiedGenotyper: known (present in dbSNP 129)", "UnifiedGenotyper: novel (absent in dbSNP 129)", "QCALL: all", "QCALL: known (present in dbSNP 129)", "QCALL: novel (absent in dbSNP 129)"), lwd=c(3, 3, 3, 1, 1, 1, 1, 1, 1), lty=c(1, 1, 1, 2, 2, 2, 3, 3, 3), col=c("black", "blue", "red"), cex=1.2);
|
||||||
|
|
||||||
|
if (!is.na(plotRoot)) { dev.off(); }
|
||||||
|
|
||||||
|
# Ti/Tv per allele count
|
||||||
|
if (!is.na(plotRoot)) {
|
||||||
|
pdf(paste(plotRoot, ".titv.pdf", sep=""));
|
||||||
|
} else {
|
||||||
|
x11();
|
||||||
|
}
|
||||||
|
|
||||||
|
plot(0, 0, type="n", xlab="Allele count", ylab="Transition/transversion ratio", xlim=c(0, max(dataMetricsByAc.none.all$AC)), ylim=c(0, max(dataMetricsByAc$Ti.Tv)), main=paste("Ti/Tv per allele count (", population, ")", sep=""), cex=1.2, cex.lab=1.2, cex.axis=1.2, bty="n");
|
||||||
|
|
||||||
|
points(dataMetricsByAc.int.all$AC, dataMetricsByAc.int.all$Ti.Tv, type="l", lwd=3, lty=1, col="black");
|
||||||
|
points(dataMetricsByAc.int.known$AC, dataMetricsByAc.int.known$Ti.Tv, type="l", lwd=3, lty=1, col="blue");
|
||||||
|
points(dataMetricsByAc.int.novel$AC, dataMetricsByAc.int.novel$Ti.Tv, type="l", lwd=3, lty=1, col="red");
|
||||||
|
|
||||||
|
points(dataMetricsByAc.UG.all$AC, dataMetricsByAc.int.all$Ti.Tv, type="l", lwd=1, lty=2, col="black");
|
||||||
|
points(dataMetricsByAc.UG.known$AC, dataMetricsByAc.int.known$Ti.Tv, type="l", lwd=1, lty=2, col="blue");
|
||||||
|
points(dataMetricsByAc.UG.novel$AC, dataMetricsByAc.int.novel$Ti.Tv, type="l", lwd=1, lty=2, col="red");
|
||||||
|
|
||||||
|
points(dataMetricsByAc.QCALL.all$AC, dataMetricsByAc.int.all$Ti.Tv, type="l", lwd=1, lty=3, col="black");
|
||||||
|
points(dataMetricsByAc.QCALL.known$AC, dataMetricsByAc.int.known$Ti.Tv, type="l", lwd=1, lty=3, col="blue");
|
||||||
|
points(dataMetricsByAc.QCALL.novel$AC, dataMetricsByAc.int.novel$Ti.Tv, type="l", lwd=1, lty=3, col="red");
|
||||||
|
|
||||||
|
legend("topright", c("Intersection: all", "Intersection: known (present in dbSNP 129)", "Intersection: novel (absent in dbSNP 129)", "UnifiedGenotyper: all", "UnifiedGenotyper: known (present in dbSNP 129)", "UnifiedGenotyper: novel (absent in dbSNP 129)", "QCALL: all", "QCALL: known (present in dbSNP 129)", "QCALL: novel (absent in dbSNP 129)"), lwd=c(3, 3, 3, 1, 1, 1, 1, 1, 1), lty=c(1, 1, 1, 2, 2, 2, 3, 3, 3), col=c("black", "blue", "red"), cex=1.2);
|
||||||
|
|
||||||
|
if (!is.na(plotRoot)) { dev.off(); }
|
||||||
|
|
||||||
|
# Barplot (unnormalized)
|
||||||
|
if (!is.na(plotRoot)) {
|
||||||
|
pdf(paste(plotRoot, ".venn_per_ac.pdf", sep=""));
|
||||||
|
} else {
|
||||||
|
x11();
|
||||||
|
}
|
||||||
|
|
||||||
|
norm = dataMetricsByAc.UG.all$n + dataMetricsByAc.int.all$n + dataMetricsByAc.QCALL.all$n;
|
||||||
|
mat = rbind(dataMetricsByAc.QCALL.all$n, dataMetricsByAc.int.all$n, dataMetricsByAc.UG.all$n);
|
||||||
|
matnorm = rbind(dataMetricsByAc.QCALL.all$n/norm, dataMetricsByAc.int.all$n/norm, dataMetricsByAc.UG.all$n/norm);
|
||||||
|
|
||||||
|
qcallcolor = "#FF5555";
|
||||||
|
intcolor = "#5582C6";
|
||||||
|
ugcolor = "#55BBFF";
|
||||||
|
|
||||||
|
barplot(mat, col=c(qcallcolor, intcolor, ugcolor), xlab="Allele count", ylab="counts", main=paste("Callset concordance per allele count (", population, ")", sep=""), names.arg=dataMetricsByAc.UG.all$AC);
|
||||||
|
legend("topright", c("UG-only", "Intersection", "QCALL-only"), fill=c(ugcolor, intcolor, qcallcolor));
|
||||||
|
|
||||||
|
if (!is.na(plotRoot)) { dev.off(); }
|
||||||
|
|
||||||
|
# Barplot (normalized)
|
||||||
|
if (!is.na(plotRoot)) {
|
||||||
|
pdf(paste(plotRoot, ".venn_per_ac.norm.pdf", sep=""));
|
||||||
|
} else {
|
||||||
|
x11();
|
||||||
|
}
|
||||||
|
|
||||||
|
barplot(matnorm, col=c(qcallcolor, intcolor, ugcolor), xlab="Allele count", ylab="Fraction", main=paste("Callset concordance per allele count (", population, ")", sep=""), names.arg=dataMetricsByAc.UG.all$AC, ylim=c(0, 1.3));
|
||||||
|
legend("topright", c("UG-only", "Intersection", "QCALL-only"), fill=c(ugcolor, intcolor, qcallcolor));
|
||||||
|
|
||||||
|
if (!is.na(plotRoot)) { dev.off(); }
|
||||||
|
|
||||||
|
# Per-sample
|
||||||
|
if (!is.na(plotRoot)) {
|
||||||
|
pdf(paste(plotRoot, ".counts_per_sample.pdf", sep=""), width=10);
|
||||||
|
} else {
|
||||||
|
x11(width=10);
|
||||||
|
}
|
||||||
|
|
||||||
|
dataPerSample = read.table(filePerSample, header=TRUE);
|
||||||
|
|
||||||
|
par(mar=c(7, 5, 2, 1));
|
||||||
|
|
||||||
|
plot(0, 0, type="n", xaxt="n", xlim=c(1, length(dataPerSample$sample)), ylim=c(0, max(dataPerSample$Intersection + dataPerSample$UG, dataPerSample$Intersection, dataPerSample$Intersection + dataPerSample$QCALL)), xlab="", ylab="Counts", main=paste("SNPs per sample (", population, ")", sep=""), cex=1.2, cex.lab=1.2, cex.axis=1.2);
|
||||||
|
|
||||||
|
points(dataPerSample$sample, dataPerSample$Intersection + dataPerSample$UG, col=ugcolor);
|
||||||
|
points(dataPerSample$sample, dataPerSample$Intersection, col=intcolor);
|
||||||
|
points(dataPerSample$sample, dataPerSample$Intersection + dataPerSample$QCALL, col=qcallcolor);
|
||||||
|
|
||||||
|
legend("bottomleft", c("UG", "Intersection", "QCALL"), fill=c(ugcolor, intcolor, qcallcolor));
|
||||||
|
|
||||||
|
axis(1, at=c(1:length(dataPerSample$sample)), labels=dataPerSample$sample, las=2, cex.axis=0.7);
|
||||||
|
|
||||||
|
if (!is.na(plotRoot)) { dev.off(); }
|
||||||
|
|
@ -0,0 +1,69 @@
|
||||||
|
#!/usr/bin/perl -w
|
||||||
|
|
||||||
|
use strict;
|
||||||
|
use lib "$ENV{'STING_DIR'}/perl";
|
||||||
|
use StingArgs;
|
||||||
|
use DistributedMake;
|
||||||
|
use DataTable;
|
||||||
|
use Data::Dumper;
|
||||||
|
|
||||||
|
my %args = &getCommandArguments(
|
||||||
|
"VCF" => undef,
|
||||||
|
"EVAL_ROOT" => undef,
|
||||||
|
"OUT" => undef,
|
||||||
|
|
||||||
|
"POP" => undef,
|
||||||
|
"SRP_IDS" => undef,
|
||||||
|
"SEQUENCE_INDEX" => "/broad/1KG/DCC/ftp/sequence.index",
|
||||||
|
"ALIGNMENT_INDEX" => "/broad/1KG/DCC/ftp/alignment.index"
|
||||||
|
);
|
||||||
|
|
||||||
|
my @sequenceIndex = &readTable($args{'SEQUENCE_INDEX'}, "header" => 1, "delimiter" => "\t");
|
||||||
|
my @alignmentIndex = &readTable($args{'ALIGNMENT_INDEX'}, "header" => 1, "delimiter" => "\t");
|
||||||
|
|
||||||
|
my %results;
|
||||||
|
|
||||||
|
foreach my $sequenceEntry (@sequenceIndex) {
|
||||||
|
my %sequenceEntry = %{$sequenceEntry};
|
||||||
|
|
||||||
|
if ($sequenceEntry{'STUDY_NAME'} =~ /1000/ && $sequenceEntry{'ANALYSIS_GROUP'} =~ /low coverage/ && $sequenceEntry{'FASTQ_FILE'} !~ /_\d.filt.fastq.gz/) {
|
||||||
|
${${$results{'all'}}{'sequencer'}}{$sequenceEntry{'INSTRUMENT_PLATFORM'}}++;
|
||||||
|
${${$results{$sequenceEntry{'STUDY_ID'}}}{'sequencer'}}{$sequenceEntry{'INSTRUMENT_PLATFORM'}}++;
|
||||||
|
|
||||||
|
${$results{'all'}}{'population'} = "all";
|
||||||
|
${$results{$sequenceEntry{'STUDY_ID'}}}{'population'} = $sequenceEntry{'POPULATION'};
|
||||||
|
|
||||||
|
${$results{'all'}}{'used_lanes'} += ($sequenceEntry{'WITHDRAWN'} == 0);
|
||||||
|
${$results{$sequenceEntry{'STUDY_ID'}}}{'used_lanes'} += ($sequenceEntry{'WITHDRAWN'} == 0);
|
||||||
|
|
||||||
|
${$results{'all'}}{'unused_lanes'} += ($sequenceEntry{'WITHDRAWN'} != 0);
|
||||||
|
${$results{$sequenceEntry{'STUDY_ID'}}}{'unused_lanes'} += ($sequenceEntry{'WITHDRAWN'} != 0);
|
||||||
|
|
||||||
|
if ($sequenceEntry{'WITHDRAWN'} == 0) {
|
||||||
|
${${${$results{'all'}}{'lanes_per_samples'}}}{$sequenceEntry{'SAMPLE_NAME'}}++;
|
||||||
|
${${${$results{$sequenceEntry{'STUDY_ID'}}}{'lanes_per_samples'}}}{$sequenceEntry{'SAMPLE_NAME'}}++;
|
||||||
|
} else {
|
||||||
|
${${$results{'all'}}{'withdrawn_comment'}}{$sequenceEntry{'COMMENT'}}++;
|
||||||
|
${${$results{$sequenceEntry{'STUDY_ID'}}}{'withdrawn_comment'}}{$sequenceEntry{'COMMENT'}}++;
|
||||||
|
}
|
||||||
|
|
||||||
|
${${$results{'all'}}{'lane_parities'}}{$sequenceEntry{'LIBRARY_LAYOUT'}}++;
|
||||||
|
${${$results{$sequenceEntry{'STUDY_ID'}}}{'lane_parities'}}{$sequenceEntry{'LIBRARY_LAYOUT'}}++;
|
||||||
|
|
||||||
|
${${$results{'all'}}{'submission_date'}}{$sequenceEntry{'SUBMISSION_DATE'}}++;
|
||||||
|
${${$results{$sequenceEntry{'STUDY_ID'}}}{'submission_date'}}{$sequenceEntry{'SUBMISSION_DATE'}}++;
|
||||||
|
|
||||||
|
${${$results{'all'}}{'center_names'}}{$sequenceEntry{'CENTER_NAME'}}++;
|
||||||
|
${${$results{$sequenceEntry{'STUDY_ID'}}}{'center_names'}}{$sequenceEntry{'CENTER_NAME'}}++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
foreach my $studyid (keys(%results)) {
|
||||||
|
my %study = %{${${$results{$studyid}}{'lanes_per_samples'}}};
|
||||||
|
my @samples = keys(%study);
|
||||||
|
my $numSamples = scalar(@samples);
|
||||||
|
|
||||||
|
${$results{$studyid}}{'num_samples'} = $numSamples;
|
||||||
|
}
|
||||||
|
|
||||||
|
print Dumper(\%results);
|
||||||
|
|
@ -0,0 +1,93 @@
|
||||||
|
#!/usr/bin/perl -w
|
||||||
|
|
||||||
|
use strict;
|
||||||
|
use lib "$ENV{'STING_DIR'}/perl";
|
||||||
|
use StingArgs;
|
||||||
|
use DistributedMake;
|
||||||
|
use Data::Dumper;
|
||||||
|
use File::Basename;
|
||||||
|
|
||||||
|
sub getCSV {
|
||||||
|
my ($file) = @_;
|
||||||
|
|
||||||
|
my @header;
|
||||||
|
my @csv;
|
||||||
|
|
||||||
|
open(CSV, $file);
|
||||||
|
while (my $csvline = <CSV>) {
|
||||||
|
if ($csvline !~ /#/) {
|
||||||
|
chomp(my @columns = split(/,/, $csvline));
|
||||||
|
|
||||||
|
if (scalar(@columns) > 1) {
|
||||||
|
if (scalar(@header) < 2) {
|
||||||
|
@header = @columns;
|
||||||
|
} else {
|
||||||
|
my %entry;
|
||||||
|
for (my $column = 0; $column <= $#header; $column++) {
|
||||||
|
$entry{$header[$column]} = $columns[$column];
|
||||||
|
}
|
||||||
|
|
||||||
|
push(@csv, \%entry);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
close(CSV);
|
||||||
|
|
||||||
|
return @csv;
|
||||||
|
}
|
||||||
|
|
||||||
|
my %args = &getCommandArguments("EVAL_ROOT" => undef, "OUT_ROOT" => undef, "POP" => undef);
|
||||||
|
|
||||||
|
my %values;
|
||||||
|
|
||||||
|
my @header;
|
||||||
|
my @counts = &getCSV("$args{'EVAL_ROOT'}.Count_Variants.csv");
|
||||||
|
my @titv = &getCSV("$args{'EVAL_ROOT'}.Ti_slash_Tv_Variant_Evaluator.csv");
|
||||||
|
|
||||||
|
my $varout = "$args{'OUT_ROOT'}.kvp";
|
||||||
|
open(VAROUT, ">$varout");
|
||||||
|
foreach my $entry (@counts) {
|
||||||
|
my %entry = %$entry;
|
||||||
|
|
||||||
|
print VAROUT "$entry{'jexl_expression'}.$entry{'filter_name'}.$entry{'novelty_name'}.nVariantLoci\t$entry{'nVariantLoci'}\n";
|
||||||
|
|
||||||
|
if ($entry{'jexl_expression'} eq 'Intersection') { $values{'Intersection'} = $entry{'nVariantLoci'}; }
|
||||||
|
if ($entry{'jexl_expression'} eq 'UG') { $values{'UG-only'} = $entry{'nVariantLoci'}; }
|
||||||
|
if ($entry{'jexl_expression'} eq 'QCALL') { $values{'QCALL-only'} = $entry{'nVariantLoci'}; }
|
||||||
|
}
|
||||||
|
|
||||||
|
foreach my $entry (@titv) {
|
||||||
|
my %entry = %$entry;
|
||||||
|
|
||||||
|
print VAROUT "$entry{'jexl_expression'}.$entry{'filter_name'}.$entry{'novelty_name'}.ti/tv_ratio\t$entry{'ti/tv ratio'}\n";
|
||||||
|
}
|
||||||
|
|
||||||
|
my $outpng = "$args{'OUT_ROOT'}.png";
|
||||||
|
print VAROUT "venndiagram\t$args{'OUT_ROOT'}\n";
|
||||||
|
print VAROUT "population\t$args{'POP'}\n";
|
||||||
|
print VAROUT "timestamp\t" . gmtime() . "\n";
|
||||||
|
close(VAROUT);
|
||||||
|
|
||||||
|
my $ug = $values{'UG-only'} + $values{'Intersection'};
|
||||||
|
my $qcall = $values{'QCALL-only'} + $values{'Intersection'};
|
||||||
|
|
||||||
|
my $scaledug = ($ug > $qcall) ? 100 : int($ug*100/$qcall);
|
||||||
|
my $scaledqcall = ($qcall > $ug) ? 100 : int($qcall*100/$ug);
|
||||||
|
my $scaledint = ($ug > $qcall) ? int($values{'Intersection'}*100/$ug) : int($values{'Intersection'}*100/$qcall);
|
||||||
|
|
||||||
|
my $dm = new DistributedMake("dryRun" => 0);
|
||||||
|
|
||||||
|
my $vennCmd = "wget -O $outpng \"http://chart.apis.google.com/chart?cht=v&chd=t:$scaledqcall,$scaledug,0,$scaledint&chco=FF3355,00BBFF&chs=600x480&chdl=QCALL|UnifiedGenotyper&chtt=Callset+concordance+($args{'POP'})&chts=000000,20&chma=100,0,0,0\"";
|
||||||
|
$dm->addRule($outpng, $varout, $vennCmd);
|
||||||
|
|
||||||
|
my $render = "$args{'OUT_ROOT'}.tex";
|
||||||
|
my $renderCmd = "$ENV{'STING_DIR'}/perl/reports/1KG/RenderTemplate.pl KVP=$varout TEMPLATE=$ENV{'STING_DIR'}/perl/reports/1KG/venn_template.tex TEMPLATE_OUT=$render";
|
||||||
|
$dm->addRule($render, $outpng, $renderCmd);
|
||||||
|
|
||||||
|
my $pdf = "$args{'OUT_ROOT'}.pdf";
|
||||||
|
my $pdfdir = dirname($args{'OUT_ROOT'});
|
||||||
|
my $pdfCmd = "pdflatex -output-directory $pdfdir $render";
|
||||||
|
$dm->addRule($pdf, $render, $pdfCmd);
|
||||||
|
|
||||||
|
$dm->execute();
|
||||||
|
|
@ -0,0 +1,37 @@
|
||||||
|
#!/usr/bin/perl -w
|
||||||
|
|
||||||
|
use strict;
|
||||||
|
use lib "$ENV{'STING_DIR'}/perl";
|
||||||
|
use StingArgs;
|
||||||
|
use Data::Dumper;
|
||||||
|
|
||||||
|
my %args = &getCommandArguments("KVP" => undef, "TEMPLATE" => undef, "TEMPLATE_OUT" => undef);
|
||||||
|
|
||||||
|
my %vars;
|
||||||
|
open(KVP, $args{'KVP'});
|
||||||
|
while (my $kvpline = <KVP>) {
|
||||||
|
chomp($kvpline);
|
||||||
|
|
||||||
|
my ($key, $value) = split(/\t+/, $kvpline);
|
||||||
|
$vars{$key} = $value;
|
||||||
|
}
|
||||||
|
close(KVP);
|
||||||
|
|
||||||
|
open(TEMPLATE, $args{'TEMPLATE'});
|
||||||
|
open(TEMPLATE_OUT, ">$args{'TEMPLATE_OUT'}");
|
||||||
|
while (my $line = <TEMPLATE>) {
|
||||||
|
chomp($line);
|
||||||
|
|
||||||
|
while ($line =~ /\$(.+?)\$/) {
|
||||||
|
if (exists($vars{$1})) {
|
||||||
|
my ($key, $value) = ($1, $vars{$1});
|
||||||
|
$line =~ s/\$$key\$/$value/;
|
||||||
|
} else {
|
||||||
|
$line =~ s/\$$1\$/unknown/;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
print TEMPLATE_OUT "$line\n";
|
||||||
|
}
|
||||||
|
close(TEMPLATE_OUT);
|
||||||
|
close(TEMPLATE);
|
||||||
|
|
@ -0,0 +1,24 @@
|
||||||
|
\documentclass[10pt]{article}
|
||||||
|
|
||||||
|
\usepackage[lmargin=0.3in, rmargin=0.3in, tmargin=0.9in, bmargin=0.9in, paperwidth=10.0in, paperheight=7.5in]{geometry}
|
||||||
|
\usepackage{multicol}
|
||||||
|
\usepackage{graphicx}
|
||||||
|
\usepackage{fancyhdr}
|
||||||
|
|
||||||
|
\pagestyle{fancy}
|
||||||
|
|
||||||
|
\begin{document}
|
||||||
|
\lhead{\bfseries{\Large{$title$ - $population$ samples}}}
|
||||||
|
\rhead{}
|
||||||
|
\chead{}
|
||||||
|
|
||||||
|
\begin{center}
|
||||||
|
\includegraphics[height=125mm, type=pdf, ext=.pdf, read=.pdf]{$plotprefix$}
|
||||||
|
\end{center}
|
||||||
|
|
||||||
|
$caption$
|
||||||
|
|
||||||
|
\lfoot{}
|
||||||
|
\cfoot{}
|
||||||
|
\rfoot{\small{1,000 Genomes Automated Data Processing Report ($timestamp$)}}
|
||||||
|
\end{document}
|
||||||
|
|
@ -0,0 +1,47 @@
|
||||||
|
\documentclass[10pt]{article}
|
||||||
|
|
||||||
|
\usepackage[lmargin=0.3in, rmargin=0.3in, tmargin=0.9in, bmargin=0.9in, paperwidth=10.0in, paperheight=7.5in]{geometry}
|
||||||
|
\usepackage{multicol}
|
||||||
|
\usepackage{graphicx}
|
||||||
|
\usepackage{fancyhdr}
|
||||||
|
|
||||||
|
\pagestyle{fancy}
|
||||||
|
|
||||||
|
\begin{document}
|
||||||
|
\lhead{\bfseries{\Large{Sequencing and variant summary - $population$ samples}}}
|
||||||
|
\rhead{}
|
||||||
|
\chead{}
|
||||||
|
|
||||||
|
\begin{tabular}{|r|l|l|}
|
||||||
|
\multicolumn{3}{l}{\bfseries{Sequencing summary}} \\
|
||||||
|
\hline
|
||||||
|
& Lanes & Aligned Lanes \\
|
||||||
|
\hline
|
||||||
|
All & $allLanes$ & $allAlignedLanes$ \\
|
||||||
|
Passed QC & $passedQCLanes$ & $passedQCAlignedLanes$ \\
|
||||||
|
Failed Genotype QC & $failedGenotypeQCLanes$ & $failedGenotypeQCAlignedLanes$ \\
|
||||||
|
Failed one of the DCC processes & $failedDCCProcessLanes$ & $failedDCCProcessAlignedLanes$ \\
|
||||||
|
Not yet available from archive & $failedUnavailableLanes$ & $failedUnavailableAlignedLanes$ \\
|
||||||
|
Suppressed in archive & $failedSuppressedLanes$ & $failedSuppressedAlignedLanes$ \\
|
||||||
|
\hline
|
||||||
|
\end{tabular}
|
||||||
|
|
||||||
|
\begin{tabular}{|r|l|l|}
|
||||||
|
\multicolumn{3}{l}{\bfseries{Variant summary}} \\
|
||||||
|
\hline
|
||||||
|
& UnifiedGenotyper & QCALL \\
|
||||||
|
\hline
|
||||||
|
Called samples & $ugSamples$ & $qcallSamples$ \\
|
||||||
|
SNPs (all) & $ugAllSNPs$ & $qcallAllSNPs$ \\
|
||||||
|
Ti/Tv (all) & $ugAllTiTv$ & $qcallAllTiTv$ \\
|
||||||
|
SNPs (known) & $ugKnownSNPs$ & $qcallKnownSNPs$ \\
|
||||||
|
Ti/Tv (known) & $ugKnownTiTv$ & $qcallKnownTiTv$ \\
|
||||||
|
SNPs (novel) & $ugNovelSNPs$ & $qcallNovelSNPs$ \\
|
||||||
|
Ti/Tv (novel) & $ugNovelTiTv$ & $qcallNovelTiTv$ \\
|
||||||
|
\hline
|
||||||
|
\end{tabular}
|
||||||
|
|
||||||
|
\lfoot{}
|
||||||
|
\cfoot{}
|
||||||
|
\rfoot{\small{1,000 Genomes Automated Data Processing Report ($timestamp$)}}
|
||||||
|
\end{document}
|
||||||
|
|
@ -0,0 +1,65 @@
|
||||||
|
\documentclass[10pt]{article}
|
||||||
|
|
||||||
|
\usepackage[margin=1in, paperwidth=10.0in, paperheight=7.5in]{geometry}
|
||||||
|
\usepackage{multicol}
|
||||||
|
\usepackage{fancyhdr}
|
||||||
|
\pagestyle{fancy}
|
||||||
|
|
||||||
|
\begin{document}
|
||||||
|
\lhead{\bfseries{\Large{1,000 Genomes Data Processing Report - $population$ samples}}}
|
||||||
|
\rhead{}
|
||||||
|
\chead{}
|
||||||
|
|
||||||
|
\begin{multicols}{2}
|
||||||
|
\begin{tabular}{|r|l|}
|
||||||
|
\multicolumn{2}{l}{\bfseries{Project Summary ($study_id$)}} \\
|
||||||
|
\hline
|
||||||
|
Used samples & $used_samples$ \\
|
||||||
|
Unused samples & $unused_samples$ \\
|
||||||
|
\hline
|
||||||
|
\end{tabular}
|
||||||
|
|
||||||
|
\begin{tabular}{|r|l|}
|
||||||
|
\multicolumn{2}{l}{\bfseries{Sequencing Summary}} \\
|
||||||
|
\hline
|
||||||
|
Sequencers & $sequencers$ \\
|
||||||
|
Used lanes & $used_lanes$ \\
|
||||||
|
Unused lanes & $unused_lanes$ \\
|
||||||
|
Used lanes/sample & $used_lanes_per_sample$ \\
|
||||||
|
Lane parities & $lane_parities$ \\
|
||||||
|
Read lengths & $read_lengths$ \\
|
||||||
|
Sequencing dates & $sequencing_dates$ \\
|
||||||
|
\hline
|
||||||
|
\end{tabular}
|
||||||
|
|
||||||
|
\begin{tabular}{|r|l|l|}
|
||||||
|
\multicolumn{3}{l}{\bfseries{Bases Summary (excluding unused lanes/samples)}} \\
|
||||||
|
\hline
|
||||||
|
& Per lane & Per sample \\
|
||||||
|
\hline
|
||||||
|
Reads & $reads_per_lane$ & $reads_per_sample$ \\
|
||||||
|
Used bases & $used_bases_per_lane$ & $used_bases_per_sample$ \\
|
||||||
|
Coverage & $coverage_per_lane$ & $coverage_per_sample$ \\
|
||||||
|
\% loci $>$ 1x covered & $loci_gt_1x_per_lane$ & $loci_gt_1x_per_sample$ \\
|
||||||
|
\% loci $>$ 2x covered & $loci_gt_2x_per_lane$ & $loci_gt_2x_per_sample$ \\
|
||||||
|
\% loci $>$ 3x covered & $loci_gt_3x_per_lane$ & $loci_gt_3x_per_sample$ \\
|
||||||
|
\% loci $>$ 4x covered & $loci_gt_4x_per_lane$ & $loci_gt_4x_per_sample$ \\
|
||||||
|
\hline
|
||||||
|
\end{tabular}
|
||||||
|
|
||||||
|
\begin{tabular}{|r|l|l|}
|
||||||
|
\multicolumn{3}{l}{\bfseries{Variant Summary}} \\
|
||||||
|
\hline
|
||||||
|
& Found & Est. FP rate \\
|
||||||
|
\hline
|
||||||
|
All SNPs & $all_snps_found$ & $all_snps_fprate$ \\
|
||||||
|
Known SNPs & $known_snps_found$ & $known_snps_fprate$ \\
|
||||||
|
Novel SNPs & $novel_snps_found$ & $novel_snps_fprate$ \\
|
||||||
|
\hline
|
||||||
|
\end{tabular}
|
||||||
|
\end{multicols}
|
||||||
|
|
||||||
|
%\lfoot{kiran@broadinstitute.org}
|
||||||
|
%\cfoot{}
|
||||||
|
%\rfoot{}
|
||||||
|
\end{document}
|
||||||
|
|
@ -0,0 +1,62 @@
|
||||||
|
\documentclass[10pt]{article}
|
||||||
|
|
||||||
|
\usepackage[lmargin=0.3in, rmargin=0.3in, tmargin=0.9in, bmargin=0.9in, paperwidth=10.0in, paperheight=7.5in]{geometry}
|
||||||
|
\usepackage{multicol}
|
||||||
|
\usepackage{graphicx}
|
||||||
|
\usepackage{fancyhdr}
|
||||||
|
|
||||||
|
\pagestyle{fancy}
|
||||||
|
|
||||||
|
\begin{document}
|
||||||
|
\lhead{\bfseries{\Large{Callset concordance - $population$ samples}}}
|
||||||
|
\rhead{}
|
||||||
|
\chead{}
|
||||||
|
|
||||||
|
\begin{center}
|
||||||
|
\begin{tabular}{|r|l|l|}
|
||||||
|
\multicolumn{3}{l}{\bfseries{UG and QCALL}} \\
|
||||||
|
\hline
|
||||||
|
Set & Variants & Ti/Tv \\
|
||||||
|
\hline
|
||||||
|
All SNPs & $Intersection.called.all.nVariantLoci$ & $Intersection.called.all.ti/tv_ratio$ \\
|
||||||
|
Known SNPs & $Intersection.called.known.nVariantLoci$ & $Intersection.called.known.ti/tv_ratio$ \\
|
||||||
|
Novel SNPs & $Intersection.called.novel.nVariantLoci$ & $Intersection.called.novel.ti/tv_ratio$ \\
|
||||||
|
Filtered in UG, present in QCALL & $InUG-FilteredInQCALL.called.all.nVariantLoci$ & $InUG-FilteredInQCALL.called.all.ti/tv_ratio$ \\
|
||||||
|
Filtered in QCALL, present in UG & $InQCALL-FilteredInUG.called.all.nVariantLoci$ & $InQCALL-FilteredInUG.called.all.ti/tv_ratio$ \\
|
||||||
|
\hline
|
||||||
|
\end{tabular} \\[0.1in]
|
||||||
|
|
||||||
|
\includegraphics[width=95mm, type=png, ext=.png, read=.png]{$venndiagram$}
|
||||||
|
\end{center}
|
||||||
|
|
||||||
|
\begin{multicols}{2}
|
||||||
|
\hfill
|
||||||
|
\begin{tabular}{|r|l|l|}
|
||||||
|
\multicolumn{3}{l}{\bfseries{UG only}} \\
|
||||||
|
\hline
|
||||||
|
Set & Variants & Ti/Tv \\
|
||||||
|
\hline
|
||||||
|
All SNPs & $UG.called.all.nVariantLoci$ & $UG.called.all.ti/tv_ratio$ \\
|
||||||
|
Known SNPs & $UG.called.known.nVariantLoci$ & $UG.called.known.ti/tv_ratio$ \\
|
||||||
|
Novel SNPs & $UG.called.novel.nVariantLoci$ & $UG.called.novel.ti/tv_ratio$ \\
|
||||||
|
Filtered & $UG.filtered.all.nVariantLoci$ & $UG.filtered.all.ti/tv_ratio$ \\
|
||||||
|
\hline
|
||||||
|
\end{tabular}
|
||||||
|
|
||||||
|
\begin{tabular}{|r|l|l|}
|
||||||
|
\multicolumn{3}{l}{\bfseries{QCALL only}} \\
|
||||||
|
\hline
|
||||||
|
Set & Variants & Ti/Tv \\
|
||||||
|
\hline
|
||||||
|
All SNPs & $QCALL.called.all.nVariantLoci$ & $QCALL.called.all.ti/tv_ratio$ \\
|
||||||
|
Known SNPs & $QCALL.called.known.nVariantLoci$ & $QCALL.called.known.ti/tv_ratio$ \\
|
||||||
|
Novel SNPs & $QCALL.called.novel.nVariantLoci$ & $QCALL.called.novel.ti/tv_ratio$ \\
|
||||||
|
Filtered & $QCALL.filtered.all.nVariantLoci$ & $QCALL.filtered.all.ti/tv_ratio$ \\
|
||||||
|
\hline
|
||||||
|
\end{tabular}
|
||||||
|
\end{multicols}
|
||||||
|
|
||||||
|
\lfoot{}
|
||||||
|
\cfoot{}
|
||||||
|
\rfoot{\small{1,000 Genomes Automated Data Processing Report ($timestamp$)}}
|
||||||
|
\end{document}
|
||||||
Loading…
Reference in New Issue