-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathpick_random_line.py
executable file
·157 lines (126 loc) · 5.06 KB
/
pick_random_line.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
#!/usr/bin/env python
#coding:gbk
# Author: pengtao --<[email protected]>
# Purpose:
# 1. pick random lines (mostly queries) from a population file
# History:
# 1. 2012/12/6 update from old version
import math
import sys
import random
import os
import string
from optparse import OptionParser
########################################################################
class CPickRandomLine:
    """Pick random lines from one or more text files.

    A simple tool to sample data line by line. Input files are opened in
    text mode with the platform default encoding.

    usage:
    >>> p = CPickRandomLine(['input1.txt', 'input2.txt'])
    >>> p.set_output_stream(open('output1', "w"))
    >>> p.pick_exact_lines(100)
    >>> p.get_file_sizes() # or set_file_sizes()
    >>> p.set_output_stream(open("output2", "w"))
    >>> p.pick_mol_lines(0.13)
    >>> p.set_output_stream(open("output3", "w"))
    >>> p.pick_mol_lines(300)
    """
    #----------------------------------------------------------------------
    def __init__(self, files):
        """Remember the input files and seed the random generator.

        files -- sequence of paths of the files to sample from.
        """
        self.nFile = len(files)
        self.Files = files
        # Seeds the shared module-level generator of `random` with 4 bytes
        # obtained from the operating system.
        random.seed(os.urandom(4))
        # Line count of each input file; filled by set_file_sizes() or
        # get_file_sizes().
        self.FileSizes = None
        # Writable object receiving the sampled lines; see set_output_stream().
        self.outputstream = None
    #----------------------------------------------------------------------
    def set_file_sizes(self, sizes):
        """Manually set the line count of each file instead of counting them.

        It's useful when the files are really large.

        sizes -- one line count per input file, in the same order as the
                 files given to the constructor.

        Writes a message to stderr and exits with status 1 when the number
        of sizes differs from the number of files.
        """
        if self.nFile != len(sizes):
            sys.stderr.write("length of files and sizes do not match %s : %s\n" % (self.nFile, len(sizes)))
            sys.exit(1)
        self.FileSizes = sizes
    #----------------------------------------------------------------------
    def get_file_sizes(self):
        """Count the lines of every input file and store them in FileSizes."""
        sizes = []
        for fn in self.Files:
            # `with` closes the handle even if reading fails half way.
            with open(fn) as fh:
                sizes.append(sum(1 for _ in fh))
        self.FileSizes = sizes
    #----------------------------------------------------------------------
    def set_output_stream(self, out):
        """Set the output stream (any object with a write() method)."""
        self.outputstream = out
    #----------------------------------------------------------------------
    def pick_mol_lines(self, p):
        """Sample roughly p lines from the files.

        p is treated as a probability (portion) if 0 < p < 1
        p is treated as a target number of lines if p >= 1

        Note the simplest strategy is employed: every line is kept
        independently with the same probability, so the result is not
        exactly p lines. mol means 'more or less' :-)

        When p >= 1 the line counts are needed; they are counted on demand
        if neither set_file_sizes() nor get_file_sizes() was called before.

        Writes a message to stderr and exits with status 1 when p <= 0.
        """
        if p <= 0:
            sys.stderr.write("wrong portion/number %f for picking random line\n" % float(p))
            sys.exit(1)
        if p >= 1:
            if self.FileSizes is None:
                self.get_file_sizes()
            total = sum(self.FileSizes)
            if total == 0:
                # Nothing to sample from; also avoids dividing by zero.
                return
            pr = float(p) / total
        else:
            pr = p
        for fn in self.Files:
            with open(fn) as fh:
                for line in fh:
                    if random.random() < pr:
                        self.outputstream.write(line)
    #----------------------------------------------------------------------
    def pick_exact_lines(self, num):
        """Pick exactly num lines from the files, in random order.

        It will load all files in memory. ONLY for small files.

        When fewer than num lines are available a warning is written to
        stderr and every line is written (in random order).
        """
        lines = []
        for fn in self.Files:
            with open(fn) as fh:
                lines.extend(fh)
        if len(lines) < num:
            sys.stderr.write("all lines are %d, less than required %d\n" % (len(lines), num))
            num = len(lines)
        # max() keeps a non-positive request a no-op instead of an error.
        for line in random.sample(lines, max(num, 0)):
            self.outputstream.write(line)
        return
#----------------------------------------------------------------------
def parse_args():
    """Parse the command line of the sampling script.

    Returns a tuple (options, args):
      options.output -- writable stream: sys.stdout by default, otherwise
                        the file named by -o/--output opened for writing
      options.num    -- float given by -n/--num
      options.type   -- 'exact' or 'mol'
      args           -- list of input file paths

    Prints the help text and exits with status 0 when no input file or no
    --num is given. Invalid values (unknown --type, non-positive --num) are
    reported through parser.error().
    """
    parser = OptionParser(usage="usage: %prog [options] [ inputfiles [..] ]")
    parser.add_option("-o", "--output", default="STDOUT", help="output path. default is 'STDOUT'. ")
    parser.add_option("-n", "--num", type="float", help="num of lines to pick up. <1 means a ratio and >=1 means a number")
    # A choice option rejects typos instead of silently falling back to 'mol'.
    parser.add_option("-t", "--type", type="choice", choices=["exact", "mol"], default="mol", help="type to pick: 'exact' or 'mol' (more or less). default is 'mol'. When num is a ratio, the program will pick only in a mol way.")
    options, args = parser.parse_args()
    if not args or options.num is None:
        parser.print_help()
        sys.exit(0)
    # Validate before opening the output so a bad call never truncates a file.
    if options.num <= 0:
        parser.error("--num must be positive, got %s" % options.num)
    if options.output == "STDOUT":
        options.output = sys.stdout
    else:
        options.output = open(options.output, "w")
    return (options, args)
if __name__ == '__main__':
    # Command-line entry point: sample lines from the given input files and
    # write them to the selected output.
    (options, args) = parse_args()
    picker = CPickRandomLine(args)
    picker.set_output_stream(options.output)
    try:
        if options.num >= 1 and options.type == "exact":
            picker.pick_exact_lines(int(options.num))
        else:
            if options.num >= 1:
                # Line counts are only needed to turn a target number into a
                # per-line probability; a ratio needs no counting pass.
                picker.get_file_sizes()
            picker.pick_mol_lines(options.num)
    finally:
        # Make sure buffered output reaches its destination; never close
        # the interpreter's own stdout.
        if options.output is sys.stdout:
            options.output.flush()
        else:
            options.output.close()