-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathranks.py
100 lines (80 loc) · 3.01 KB
/
ranks.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
"""Prints ranks for a set of genes in the given sample.
Usage:
ranks.py --sample=<path> --atlas=<path> --gene_list=<path> --tissue=<tissue>
Options:
-h --help Show this screen.
--sample path to sample tabbed-csv, containing at least 'hgnc_symbol' and 'rpkm' columns
--atlas path to atlas tabbed-csv, containing at least 'hgnc_symbol' and <tissue> columns
--gene_list path to file containing list of genes to measure ranks for, one per line
--tissue tissue name as in RNA-seq atlas (e.g. "liver")
"""
import csv
import docopt
import operator
args = docopt.docopt(__doc__)
sample_path = args.get('--sample')
atlas_path = args.get('--atlas')
gene_list_path = args.get('--gene_list')
tissue_name = args.get('--tissue')
def sample_ranks():
print 'Sample counts'
reader = csv.reader(open(sample_path, 'r'), delimiter='\t')
headers = reader.next()
hgnc_pos = headers.index('hgnc_symbol')
rpkm_pos = headers.index('rpkm')
return genes_sorted_by_rpkm(reader, hgnc_pos, rpkm_pos)
def atlas_ranks():
print 'Atlas counts'
reader = csv.reader(open(atlas_path, 'r'), delimiter='\t')
headers = reader.next()
hgnc_pos = headers.index('hgnc_symbol')
rpkm_pos = headers.index(tissue_name)
return genes_sorted_by_rpkm(reader, hgnc_pos, rpkm_pos)
# returns a list of genes sorted by rpkm, descending. for repeat
# genes, uses the first value seen in the file.
def genes_sorted_by_rpkm(reader, hgnc_pos, rpkm_pos):
vals = {}
empty_count = 0
repeat_count = 0
total = 0
for row in reader:
total += 1
gene_name = row[hgnc_pos]
if gene_name == '' or gene_name == 'NA':
empty_count += 1
continue
# data has gene repeats, fine only if the values are the same
rpkm = eval(row[rpkm_pos])
if gene_name in vals:
if abs(rpkm - vals[gene_name]) > 0.00001:
repeat_count += 1
else:
vals[gene_name] = rpkm
print '%d unnamed, %d repeated, %d total rows' % (empty_count, repeat_count, total)
return [x[0] for x in sorted(
vals.iteritems(), key=operator.itemgetter(1), reverse=True)]
def read_gene_list():
return open(gene_list_path).read().split('\n')
def print_ranks(genes, atlas_ranked_genes, sample_ranked_genes):
print '\nABSOLUTE RANK'
for gene in genes:
try:
print '%s: sample %d, atlas %d' % (
gene, sample_ranked_genes.index(gene), atlas_ranked_genes.index(gene))
except ValueError:
print '%s: not found' % gene
print '\nRANK FRACTION'
for gene in genes:
try:
sample_fraction = float(sample_ranked_genes.index(gene))/len(sample_ranked_genes)
atlas_fraction = float(atlas_ranked_genes.index(gene))/len(atlas_ranked_genes)
print '%s: sample %f, atlas %f' % (gene, sample_fraction, atlas_fraction)
except ValueError:
print '%s: not found' % gene
print '\nTotal distinct genes: %d sample, %d atlas' % (
len(sample_ranked_genes), len(atlas_ranked_genes))
if __name__ == "__main__":
sample = sample_ranks()
atlas = atlas_ranks()
genes = read_gene_list()
print_ranks(genes, atlas, sample)