2017-10-20 20:33:43 +08:00
|
|
|
#!/usr/bin/env python
|
|
|
|
|
|
|
|
#
|
|
|
|
# Sort large text files in a minimum amount of memory
|
|
|
|
#
|
|
|
|
import argparse
|
2020-07-06 15:44:19 +08:00
|
|
|
import os
|
2017-10-20 20:33:43 +08:00
|
|
|
|
2019-10-05 13:14:13 +08:00
|
|
|
|
2020-01-03 22:25:36 +08:00
|
|
|
class FileSplitter:
    """Split a text file into individually sorted block files.

    Block files are created relative to the current working directory using
    ``BLOCK_FILENAME_FORMAT``; their names are recorded in
    ``block_filenames`` in the order they were written.
    """

    BLOCK_FILENAME_FORMAT = "block_{0}.dat"

    def __init__(self, filename):
        """Remember the input file name; no I/O happens here."""
        self.filename = filename
        self.block_filenames = []

    def write_block(self, data, block_number):
        """Write ``data`` to the block file numbered ``block_number``.

        The block's file name is appended to ``block_filenames`` once the
        write has completed.
        """
        filename = self.BLOCK_FILENAME_FORMAT.format(block_number)
        with open(filename, "w") as file:
            file.write(data)
        self.block_filenames.append(filename)

    def get_block_filenames(self):
        """Return the list of block file names written so far."""
        return self.block_filenames

    def split(self, block_size, sort_key=None):
        """Read the input in chunks, sort each chunk and write it as a block.

        ``block_size`` is passed to ``readlines`` as a size hint, so a block
        holds whole lines only. ``sort_key`` is an optional key function
        applied when sorting the lines of each block.
        """
        block_number = 0
        with open(self.filename) as file:
            while True:
                lines = file.readlines(block_size)
                if not lines:
                    break
                # Only the very last line of the input can lack a newline.
                # Terminate it so that it is not fused with another line
                # once sorting moves it away from the end of the block.
                if not lines[-1].endswith("\n"):
                    lines[-1] += "\n"
                lines.sort(key=sort_key)
                self.write_block("".join(lines), block_number)
                block_number += 1

    def cleanup(self):
        """Delete every block file written by this splitter.

        An explicit loop is used because ``map`` is lazy on Python 3 and
        would never call ``os.remove``. The recorded names are dropped
        afterwards so a second call does not try to remove them again.
        """
        for block_filename in self.block_filenames:
            os.remove(block_filename)
        self.block_filenames = []
|
|
|
|
|
|
|
|
|
2020-01-03 22:25:36 +08:00
|
|
|
class NWayMerge:
    """Merge strategy that selects the buffer holding the smallest value."""

    def select(self, choices):
        """Return the key (or index) of the smallest value in ``choices``.

        ``choices`` may be a mapping of buffer index to pending line, or a
        plain sequence of lines. Ties go to the entry seen first in
        iteration order. ``-1`` is returned when ``choices`` is empty.
        """
        # Mappings may have gaps in their keys (exhausted buffers are left
        # out), so iterate over the real keys rather than range(len(...)).
        if hasattr(choices, "items"):
            entries = choices.items()
        else:
            entries = enumerate(choices)

        min_index = -1
        min_str = None
        found = False
        for index, value in entries:
            if not found or value < min_str:
                min_index = index
                min_str = value  # remember the current minimum
                found = True

        return min_index
|
|
|
|
|
|
|
|
|
2020-01-03 22:25:36 +08:00
|
|
|
class FilesArray:
    """Keep a one-line look-ahead buffer for each of several open files.

    ``files`` is indexed by the integers ``0 .. len(files) - 1``. A buffer
    holds ``None`` while it is waiting to be refilled; indices of files that
    have been read to the end are collected in ``empty``.
    """

    def __init__(self, files):
        count = len(files)
        self.files = files
        self.num_buffers = count
        self.empty = set()
        # Every slot starts out unfilled.
        self.buffers = dict.fromkeys(range(count))

    def get_dict(self):
        """Return ``{index: buffered line}`` for all files not yet exhausted."""
        live = {}
        for slot in range(self.num_buffers):
            if slot in self.empty:
                continue
            live[slot] = self.buffers[slot]
        return live

    def refresh(self):
        """Refill every unfilled buffer with the next line of its file.

        A file whose ``readline`` returns ``""`` is marked exhausted and
        closed. Returns ``False`` once all files are exhausted, otherwise
        ``True``.
        """
        for slot in range(self.num_buffers):
            if slot in self.empty or self.buffers[slot] is not None:
                continue
            line = self.files[slot].readline()
            self.buffers[slot] = line
            if line == "":
                self.empty.add(slot)
                self.files[slot].close()
        return len(self.empty) != self.num_buffers

    def unshift(self, index):
        """Take the buffered line at ``index``, leaving the slot unfilled."""
        value, self.buffers[index] = self.buffers[index], None
        return value
|
|
|
|
|
|
|
|
|
2020-01-03 22:25:36 +08:00
|
|
|
class FileMerger:
    """Merge several sorted text files into one output file.

    ``merge_strategy`` must provide ``select(choices)``, which receives the
    mapping of buffer index to pending line and returns the index of the
    line to emit next.
    """

    def __init__(self, merge_strategy):
        self.merge_strategy = merge_strategy

    def merge(self, filenames, outfilename, buffer_size):
        """Merge ``filenames`` into ``outfilename``.

        ``buffer_size`` is the buffering value used for every input file and
        for the output file. All input handles are closed when the merge
        ends, whether it succeeds or raises.
        """
        handles = self.get_file_handles(filenames, buffer_size)
        try:
            buffers = FilesArray(handles)
            buffering = self._normalize_buffering(buffer_size)
            with open(outfilename, "w", buffering) as outfile:
                while buffers.refresh():
                    min_index = self.merge_strategy.select(buffers.get_dict())
                    outfile.write(buffers.unshift(min_index))
        finally:
            # Closing an already closed file is a no-op, so this is safe
            # even for handles that were closed on reaching end of file.
            for handle in handles.values():
                handle.close()

    def get_file_handles(self, filenames, buffer_size):
        """Open every file for reading and return ``{position: handle}``.

        If one of the files cannot be opened, the handles opened so far are
        closed before the error is re-raised.
        """
        buffering = self._normalize_buffering(buffer_size)
        files = {}
        try:
            for index, name in enumerate(filenames):
                files[index] = open(name, "r", buffering)
        except Exception:
            for handle in files.values():
                handle.close()
            raise
        return files

    @staticmethod
    def _normalize_buffering(buffer_size):
        """Turn ``buffer_size`` into a value text-mode ``open()`` accepts.

        ``open()`` needs an integer, and 0 (unbuffered) is rejected for text
        files, so 0 falls back to -1, the default buffering policy.
        """
        size = int(buffer_size)
        return size if size != 0 else -1
|
|
|
|
|
|
|
|
|
2020-01-03 22:25:36 +08:00
|
|
|
class ExternalSort:
    """Sort a text file by splitting it into sorted blocks and merging them.

    ``block_size`` is the size, in bytes, used both as the hint for how much
    of the input goes into one block and as the budget that is divided among
    the merge buffers.
    """

    def __init__(self, block_size):
        self.block_size = block_size

    def sort(self, filename, sort_key=None):
        """Sort ``filename`` and write the result to ``filename + ".out"``.

        ``sort_key`` is forwarded to the splitter for sorting each block.
        NOTE(review): the merge strategy is created without the key, so the
        merge phase does not receive ``sort_key``.
        """
        num_blocks = self.get_number_blocks(filename, self.block_size)
        splitter = FileSplitter(filename)
        try:
            splitter.split(self.block_size, sort_key)

            merger = FileMerger(NWayMerge())
            # open() needs an integer buffering value and rejects 0 for text
            # files, so use floor division and never go below 1.
            buffer_size = max(1, int(self.block_size // (num_blocks + 1)))
            merger.merge(
                splitter.get_block_filenames(), filename + ".out", buffer_size
            )
        finally:
            # Ask the splitter to remove its blocks even if a step failed.
            splitter.cleanup()

    def get_number_blocks(self, filename, block_size):
        """Return the whole number of blocks estimated for ``filename``.

        Computed as ``file size // block_size + 1``.
        """
        return int(os.stat(filename).st_size // block_size) + 1
|
|
|
|
|
|
|
|
|
|
|
|
def parse_memory(string):
    """Convert a memory size such as ``"100M"`` into a number of bytes.

    A trailing ``k``, ``m`` or ``g`` (either case) multiplies the number by
    1024, 1024**2 or 1024**3; without a suffix the value is taken as bytes.
    Surrounding whitespace is ignored.

    Raises:
        ValueError: if the numeric part is missing or is not an integer,
            including when ``string`` is empty.
    """
    multipliers = {"k": 1024, "m": 1024 * 1024, "g": 1024 * 1024 * 1024}
    text = string.strip()
    # Slicing (rather than indexing) keeps an empty string from raising
    # IndexError; it falls through to int(""), which raises ValueError.
    suffix = text[-1:].lower()
    if suffix in multipliers:
        return int(text[:-1]) * multipliers[suffix]
    return int(text)
|
|
|
|
|
|
|
|
|
|
|
|
def main():
    """Command-line entry point: parse the arguments and sort the file."""
    cli = argparse.ArgumentParser()
    cli.add_argument(
        "-m", "--mem", default="100M", help="amount of memory to use for sorting"
    )
    cli.add_argument(
        "filename", nargs=1, metavar="<filename>", help="name of file to sort"
    )
    options = cli.parse_args()

    # nargs=1 makes argparse deliver the file name as a one-element list.
    (target,) = options.filename
    memory_budget = parse_memory(options.mem)
    ExternalSort(memory_budget).sort(target)


# Run the sorter only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|