#!/usr/bin/python # ----------------------------------------------------------------------------- # Read in a text file, zip compress it, and convert to base62 encoding # # Usage examples: # # [1] # Compress file and write to multiple output files with set block size of 3000 bytes # # python compress-zlib-base64.py --input infile.txt -v -s 3000 # # This generates multiple output files each of size block size in bytes # inputfile.txt.gz.b62.part0 inputfile.txt.gz.b62.part1 ... # # [2] # Decompresses input files read in sequence: # python compress-zlib-base64.py -d fname.gz.b62.part1 fname.gz.b62.part2 ... # # Generates a single decompressed plain text file and write to stdout # ----------------------------------------------------------------------------- import base64, zlib import sys import getopt import os # ----------------------------------------------------------------------------- # B62 encode # ----------------------------------------------------------------------------- def b62encode(plain): b64 = base64.b64encode(plain) return b64.replace('0', '00').replace('+', '01').replace('/', '02') # ----------------------------------------------------------------------------- # B62 decode # ----------------------------------------------------------------------------- def b62decode(data): b64 = '0'.join(part.replace('01', '+').replace('02', '/') for part in data.split('00')) return base64.b64decode(b64) # ----------------------------------------------------------------------------- # Compress # ----------------------------------------------------------------------------- def compress(plain_text): plain_bytes = plain_text #plain_bytes = plain_text.encode('utf-8') compressed_bytes = zlib.compress(plain_bytes) b62_bytes = b62encode(compressed_bytes) b62_text = b62_bytes.decode('ascii') return b62_text # ----------------------------------------------------------------------------- # Decompress # ----------------------------------------------------------------------------- def decompress(b62_bytes): #binary = b62_text #b62_bytes = b62_text.encode('ascii') compressed_bytes = b62decode(b62_bytes) plain_bytes = zlib.decompress(compressed_bytes) #plain_text = plain_bytes.decode('utf-8') return plain_bytes # ----------------------------------------------------------------------------- # Split into chunks of 3000 # ----------------------------------------------------------------------------- def split_by_n(seq, n): """A generator to divide a sequence into chunks of n units.""" while seq: yield seq[:n] seq = seq[n:] # ----------------------------------------------------------------------------- # Main # ----------------------------------------------------------------------------- _verbose = False _block_size = 0 _decompress = False _input_fname = None _extra_args = [] def main(argv): try: opts, args = getopt.getopt(argv, "i:s:dhv", ["input=", "size=", "decompress", "help", "verbose"]) except getopt.GetoptError: usage() sys.exit(2) index = 0 for opt, arg in opts: if opt in ("-i", "--input"): if arg: global _input_fname; _input_fname = arg elif opt in ("-s", "--size"): if arg: global _block_size; _block_size = int(arg) elif opt in ("-d", "--decompress"): global _decompress; _decompress = True elif opt in ("-h", "--h", "--help"): usage() sys.exit() elif opt in ("-v", "--verbose"): global _verbose; _verbose = True index += 1 index += 1 global _extra_args; _extra_args = [] for a in sys.argv[index:]: _extra_args.append(a) def usage(): print "util -i input-file-name" print print "\t-i --input file-name" print "\t-s --size size of each split file" print "\t-d --decompress switch followed list of files eg. -d f.txt.gz.b62.part0 f.txt.gz.b62.part1" if __name__ == "__main__": main(sys.argv[1:]) if _verbose: print "Input file name: ", _input_fname print "Size of block files: ", _block_size print "Decompress:", _decompress print "Other arguments:", _extra_args if _input_fname: # Compress plain_bytes = "" with open(_input_fname, 'r') as fd: for line in fd: plain_bytes += line fd.close() b62_text = compress(plain_bytes) if _block_size > 0: # eg 3000 arr = list(split_by_n(b62_text, _block_size)) else: arr = [b62_text] # Write to output basename = os.path.basename(_input_fname) for i, block in enumerate(arr): if _verbose: print "Block", i print block print len(block) print output_fname = basename + ".gz.b62.part" + str(i) print "Writing to file ", output_fname text_file = open(output_fname, "w") text_file.write(block) text_file.close() else: # Decompress concat_string = "" for fname in _extra_args: plain_bytes = "" with open(fname, 'r') as fd: for line in fd: plain_bytes += line concat_string += plain_bytes #print concat_string p = decompress(concat_string) print p