forked from cmungall/obo-scripts
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathobo-split.pl
executable file
·112 lines (95 loc) · 2.2 KB
/
obo-split.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
#!/usr/bin/perl -w
use strict;
use FileHandle;
# Number of stanzas written to each chunk; overridden by -s/--chunksize.
my $chunksize = 10000;

# Shell command every chunk is piped into (-x/--iterate). While this is
# undefined, chunks are written to files instead.
my $cmd;

# Consume leading options. The @ARGV guard keeps the pattern match from
# running against undef once the argument list is exhausted.
while (@ARGV && $ARGV[0] =~ /^\-/) {
    my $opt = shift @ARGV;
    if ($opt eq '-h' || $opt eq '--help') {
        print usage();
        exit 0;
    }
    elsif ($opt eq '-s' || $opt eq '--chunksize') {
        $chunksize = shift @ARGV;
        # A missing, zero or non-numeric size would otherwise surface later
        # as an "Illegal modulus zero" error while splitting.
        die "$opt requires a positive integer argument\n"
            unless defined $chunksize && $chunksize =~ /^[1-9][0-9]*$/;
    }
    elsif ($opt eq '-x' || $opt eq '--iterate') {
        # Everything up to a lone ';' argument belongs to the command.
        # Track the terminator explicitly: an empty @ARGV afterwards only
        # means no file names followed, not that ';' was missing.
        my $terminated = 0;
        while (@ARGV) {
            my $next = shift @ARGV;
            if ($next eq ';') {
                $terminated = 1;
                last;
            }
            $cmd .= "$next ";
        }
        if (!$terminated) {
            die "-x must end with \; I got:\n" . (defined $cmd ? $cmd : '');
        }
    }
    else {
        # Unknown options used to be dropped silently; keep going, but
        # tell the user so that typos do not go unnoticed.
        warn "ignoring unknown option: $opt\n";
    }
}

# Every remaining argument is an OBO file to split.
while (@ARGV) {
    chunk(shift @ARGV);
}
exit 0;
# chunk($file)
#
# Splits one OBO file into chunks of at most $chunksize stanzas each.
# A stanza starts at a line beginning with '['. Every line before the first
# stanza is treated as the header and is written at the top of every chunk.
# Output handles are obtained from outhandle(), numbered from 1.
#
# Dies if the input file cannot be opened. A file containing no stanzas
# produces no output at all.
sub chunk {
    my $f = shift;
    my $in_header = 1;    # true until the first stanza line is seen
    my $hdr = '';         # accumulated header text
    my $n = 0;            # number of stanzas seen so far
    my $oh;               # current output handle, undef until first stanza
    my $ih = FileHandle->new($f) || die "cannot open $f for reading: $!";
    while (my $line = <$ih>) {
        if ($line =~ /^\[/) {
            # Start a fresh chunk every $chunksize stanzas.
            if ($n % $chunksize == 0) {
                $oh->close if $oh;
                $oh = outhandle($f, int($n / $chunksize) + 1);
                print $oh $hdr;
            }
            $n++;
            $in_header = 0;
            print $oh $line;
        }
        elsif ($in_header) {
            $hdr .= $line;
        }
        else {
            print $oh $line;
        }
    }
    $ih->close;
    # No output handle exists when the input held no stanzas.
    $oh->close if $oh;
}
# outhandle($file, $id)
#
# Returns a writable FileHandle for chunk number $id of $file.
#
# When an iterate command was given ($cmd is set), the handle is a pipe to
# a new invocation of that command and the file name and id are not used.
# Otherwise the chunk file name is derived from $file by inserting
# ".chunk-$id" in front of its ".obo" extension.
#
# Dies if the command cannot be started, if $file contains no ".obo", or if
# the chunk file cannot be opened for writing.
sub outhandle {
    my ($f, $id) = @_;
    if ($cmd) {
        # The command line is deliberately handed to the shell as typed.
        my $oh = FileHandle->new("|$cmd")
            || die "cannot start command '$cmd': $!";
        return $oh;
    }

    my $out = $f;
    # Rewrite only the last ".obo" so that an earlier occurrence (for
    # example in a directory name) is left alone.
    $out =~ s/\.obo(?!.*\.obo)/.chunk-$id.obo/s
        or die "cannot derive a chunk file name from '$f': no '.obo' in name";

    # An explicit mode keeps odd characters in the name from being
    # interpreted as part of an open() mode string.
    my $oh = FileHandle->new($out, 'w')
        || die "cannot open $out for writing: $!";
    return $oh;
}
# scriptname()
#
# Returns the final '/'-separated component of $0, i.e. the name this
# script was invoked under without any leading directories.
sub scriptname {
    my @segments = split(/\//, $0);
    my $leaf = pop @segments;
    return $leaf;
}
# usage()
#
# Returns the help text as a single string, with the script's own name
# (from scriptname()) substituted into the synopsis and examples.
#
# The heredoc interpolates, so "\\;" is needed to print the "\;" a user
# has to type to get a literal ';' past the shell.
sub usage {
    my $sn = scriptname();
    return <<EOM;
$sn [-s chunksize] [-x COMMAND \\;] OBO-FILES
Splits obo file into chunks. the main reason to do this is
pre-processing before passing to memory intensive applications
-s --chunksize : number of stanzas per file/command. default 10000
-x --iterate : command to iterate over each chunk
Example:
$sn -s 1000 chebi.obo
--breaks chebi into files of size 1000
$sn -x obo-grep.pl -r calcium - \\; chebi.obo
--iterates through chebi running obo-grep on each chunk
EOM
}