2 more scripts I found helpful in syncing (and cleaning up) the 1000G mirror

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5287 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
ebanks 2011-02-22 04:17:36 +00:00
parent d7f98ccd9c
commit 63f40215b3
2 changed files with 49 additions and 0 deletions

View File

@ -0,0 +1,43 @@
#!/usr/bin/perl -w
use Getopt::Long;
# Print the command-line synopsis to standard output and terminate the
# script with exit status 1.
sub usage {
    my @help_lines = (
        "Usage: perl checkMD5s.pl",
        "\t-ai <alignment.index to check>",
        "\t-o <file to store results>",
    );
    print join("\n", @help_lines) . "\n";
    exit(1);
}
# Main driver: read the alignment index given by -ai and verify the MD5 of
# every file it lists; mismatches are written to the report file given by -o.
my $ai  = undef;    # path of the alignment.index file to check
my $out = undef;    # path of the file that receives the mismatch report

# Bail out with the usage text on unknown/malformed options as well as on
# missing required ones, instead of silently continuing.
GetOptions( "ai=s" => \$ai,
            "o=s"  => \$out ) or usage();
usage() if ( !$ai || !$out );

# Three-argument open keeps characters such as '>', '<' or '|' in the
# user-supplied names from being interpreted as an open mode.
# OUT stays a package filehandle because check() prints to it by name.
open(OUT, '>', $out) or die "can't open $out: $!";
open(my $list, '<', $ai) or die "can't open $ai: $!";

while ( my $line = <$list> ) {
    my @pieces = split(' ', $line);

    # Only lines with exactly six whitespace-separated fields are checked;
    # they are treated as three consecutive (file, md5) pairs. Anything
    # else is skipped.
    next if @pieces != 6;

    for my $i ( 0, 2, 4 ) {
        check($pieces[$i], $pieces[$i + 1]);
    }
}

close($list);
# Buffered write errors only surface at close, so check it for the report.
close(OUT) or die "can't close $out: $!";
# check($file, $target)
#
# Runs md5sum on /humgen/1kg/DCC/ftp/$file and compares the first field of
# its output with $target (string comparison). On a mismatch, the line
# "$file\t<computed md5>\t$target" is printed to the OUT filehandle, which
# the caller must already have opened for writing.
#
# If md5sum cannot be started or prints nothing on stdout, the computed
# checksum is taken to be the empty string, so the file is still reported
# as a mismatch.
sub check {
    my ($file, $target) = @_;
    my $path = "/humgen/1kg/DCC/ftp/$file";
    print "Checking $path\n";

    my $actual = '';
    # List-form pipe open runs md5sum directly, without a shell, so spaces or
    # shell metacharacters in $file cannot alter the command.
    if ( open(my $md5sum, '-|', 'md5sum', $path) ) {
        my $output = do { local $/; <$md5sum> };
        close($md5sum);
        my @md5 = split(' ', defined $output ? $output : '');
        $actual = $md5[0] if @md5;
    }
    else {
        warn "can't run md5sum on $path: $!\n";
    }

    if ( $actual ne $target ) {
        print OUT "$file\t$actual\t$target\n";
    }
}

View File

@ -0,0 +1,6 @@
#!/bin/bash
# Build sorted lists of the files present in the local mirror and the files
# named by alignment.index, then diff them with comm.
ftp_root=/humgen/1kg/DCC/ftp

# Files on disk: every regular file under data/, reduced to fields 6-9 of its
# "/"-separated path.
find "$ftp_root/data/" -type f \
    | awk -F "/" '{print $6 "/" $7 "/" $8 "/" $9}' \
    | sort > filesWeHave.list

# Files wanted: columns 1, 3 and 5 of alignment.index, one per line, ignoring
# any line that contains "MD5".
grep -v MD5 "$ftp_root/alignment.index" \
    | awk '{print $1 "\n" $3 "\n" $5}' \
    | sort > filesWeWant.list

# Lines found only in filesWeHave.list.
comm -23 filesWeHave.list filesWeWant.list > filesToDelete.list
# Lines found only in filesWeWant.list.
comm -13 filesWeHave.list filesWeWant.list > filesToGet.list