Sunday, December 28, 2008

Community Participation | Import This!

Our authors are having an impact through their participation in the global Python community, and we couldn't be more proud to support them.

This column was originally published by Python Magazine in August of 2008.

Read More

PyMOTW now uses Paver

I've converted the build and packaging infrastructure for PyMOTW from Makefiles to use Kevin Dangoor's Paver instead. Today's release (1.79) is the first version released under the new build rules, so let me know if you run into problems with it.

PyMOTW: zlib

zlib – Low-level access to GNU zlib compression library

Purpose: Low-level access to GNU zlib compression library
Python Version: 2.5 and later

The zlib module provides a lower-level interface to many of the functions in the zlib compression library from GNU.

Working with Data in Memory

The simplest way to work with zlib requires holding all of the data to be compressed or decompressed in memory, and then using compress() and decompress().

import zlib
import binascii

# Sample text; deliberately short so the size comparison in the output is easy to read.
original_data = 'This is the original text.'
print 'Original :', len(original_data), original_data

# Compress the whole string in a single call. hexlify() turns the binary
# result into printable hex digits.
compressed = zlib.compress(original_data)
print 'Compressed :', len(compressed), binascii.hexlify(compressed)

# Reverse the operation, again with all of the data held in memory.
decompressed = zlib.decompress(compressed)
print 'Decompressed :', len(decompressed), decompressed
$ python zlib_memory.py
Original : 26 This is the original text.
Compressed : 32 789c0bc9c82c5600a2928c5485fca2ccf4ccbcc41c8592d48a123d007f2f097e
Decompressed : 26 This is the original text.

Notice that for short text, the compressed version can be longer. While the actual results depend on the input data, for short bits of text it is interesting to observe the compression overhead.

import zlib

original_data = 'This is the original text.'

fmt = '%15s %15s'
print fmt % ('len(data)', 'len(compressed)')
print fmt % ('-' * 15, '-' * 15)

for i in xrange(20):
data = original_data * i
compressed = zlib.compress(data)
print fmt % (len(data), len(compressed)), '*' if len(data) < len(compressed) else ''
$ python zlib_lengths.py
len(data) len(compressed)
--------------- ---------------
0 8 *
26 32 *
52 35
78 35
104 36
130 36
156 36
182 36
208 36
234 36
260 36
286 36
312 37
338 37
364 38
390 38
416 38
442 38
468 38
494 38

Working with Streams

The in-memory approach has obvious drawbacks that make it impractical for real-world use cases. The alternative is to use Compress and Decompress objects to manipulate streams of data, so that the entire data set does not have to fit into memory.

The simple server below responds to requests consisting of filenames by writing a compressed version of the file to the socket used to communicate with the client. It has some artificial chunking in place to illustrate the buffering behavior that happens when the data passed to compress() or decompress() doesn’t result in a complete block of compressed or uncompressed output.

Warning

This server has obvious security implications. Do not run it on a server on the
open internet or in any environment where security might be an issue.

from __future__ import with_statement
import zlib
import logging
import SocketServer
import binascii

BLOCK_SIZE = 64  # bytes per read/send; small so the buffering is visible in the log


class ZlibRequestHandler(SocketServer.BaseRequestHandler):
    """Respond to a filename request with a zlib-compressed copy of the file.

    The file is read and compressed BLOCK_SIZE bytes at a time. Whatever
    the compressor emits for a block is sent right away; when it emits
    nothing the handler logs BUFFERING and moves on to the next block.
    """

    logger = logging.getLogger('Server')

    def handle(self):
        """Read a filename from the socket and stream the compressed file back."""
        compressor = zlib.compressobj(1)

        # Find out what file the client wants
        filename = self.request.recv(1024)
        self.logger.debug('client asked for: "%s"', filename)

        # Send chunks of the file as they are compressed
        with open(filename, 'rb') as input_file:
            while True:
                block = input_file.read(BLOCK_SIZE)
                if not block:
                    break
                self.logger.debug('RAW "%s"', block)
                compressed = compressor.compress(block)
                if compressed:
                    self.logger.debug('SENDING "%s"', binascii.hexlify(compressed))
                    # sendall() keeps writing until every byte is out;
                    # send() may stop early and its return value was ignored.
                    self.request.sendall(compressed)
                else:
                    self.logger.debug('BUFFERING')

        # Send any data being buffered by the compressor
        remaining = compressor.flush()
        while remaining:
            to_send = remaining[:BLOCK_SIZE]
            remaining = remaining[BLOCK_SIZE:]
            self.logger.debug('FLUSHING "%s"', binascii.hexlify(to_send))
            self.request.sendall(to_send)
        return


if __name__ == '__main__':
    # Demo driver: start the server in a background thread, request
    # lorem.txt from it, and check that the decompressed response matches
    # the file on disk.
    import socket
    import threading
    from cStringIO import StringIO

    logging.basicConfig(level=logging.DEBUG,
                        format='%(name)s: %(message)s',
                        )
    logger = logging.getLogger('Client')

    # Set up a server, running in a separate thread
    address = ('localhost', 0)  # let the kernel give us a port
    server = SocketServer.TCPServer(address, ZlibRequestHandler)
    ip, port = server.server_address  # find out what port we were given

    t = threading.Thread(target=server.serve_forever)
    t.setDaemon(True)
    t.start()

    # Connect to the server
    logger.info('Contacting server on %s:%s', ip, port)
    s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    s.connect((ip, port))

    # Ask for a file
    requested_file = 'lorem.txt'
    logger.debug('sending filename: "%s"', requested_file)
    s.sendall(requested_file)

    # Receive a response
    buffer = StringIO()
    decompressor = zlib.decompressobj()
    while True:
        response = s.recv(BLOCK_SIZE)
        if not response:
            break
        logger.debug('READ "%s"', binascii.hexlify(response))

        # Include any unconsumed data when feeding the decompressor.
        to_decompress = decompressor.unconsumed_tail + response
        while to_decompress:
            decompressed = decompressor.decompress(to_decompress)
            if decompressed:
                logger.debug('DECOMPRESSED "%s"', decompressed)
                buffer.write(decompressed)
                # Look for unconsumed data due to buffer overflow
                to_decompress = decompressor.unconsumed_tail
            else:
                logger.debug('BUFFERING')
                to_decompress = None

    # deal with data remaining inside the decompressor buffer
    remainder = decompressor.flush()
    if remainder:
        logger.debug('FLUSHED "%s"', remainder)
        buffer.write(remainder)

    full_response = buffer.getvalue()
    with open('lorem.txt', 'rt') as lorem_file:
        lorem = lorem_file.read()
    logger.debug('response matches file contents: %s', full_response == lorem)

    # Clean up
    s.close()
    server.socket.close()
$ python zlib_server.py
Client: Contacting server on 127.0.0.1:51426
Client: sending filename: "lorem.txt"
Server: client asked for: "lorem.txt"
Server: RAW "Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Donec
"
Server: SENDING "7801"
Server: RAW "egestas, enim et consectetuer ullamcorper, lectus ligula rutrum "
Client: READ "7801"
Client: BUFFERING
Server: BUFFERING
Server: RAW "leo, a
elementum elit tortor eu quam. Duis tincidunt nisi ut ant"
Server: BUFFERING
Server: RAW "e. Nulla
facilisi. Sed tristique eros eu libero. Pellentesque ve"
Server: BUFFERING
Server: RAW "l arcu. Vivamus
purus orci, iaculis ac, suscipit sit amet, pulvi"
Server: BUFFERING
Server: RAW "nar eu,
lacus. Praesent placerat tortor sed nisl. Nunc blandit d"
Server: BUFFERING
Server: RAW "iam egestas
dui. Pellentesque habitant morbi tristique senectus "
Server: BUFFERING
Server: RAW "et netus et
malesuada fames ac turpis egestas. Aliquam viverra f"
Server: BUFFERING
Server: RAW "ringilla
leo. Nulla feugiat augue eleifend nulla. Vivamus mauris"
Server: BUFFERING
Server: RAW ". Vivamus sed
mauris in nibh placerat egestas. Suspendisse poten"
Server: BUFFERING
Server: RAW "ti. Mauris massa. Ut
eget velit auctor tortor blandit sollicitud"
Server: BUFFERING
Server: RAW "in. Suspendisse imperdiet
justo.
"
Server: BUFFERING
Server: FLUSHING "5592418edb300c45f73e050f60f80e05ba6c8b0245bb676426c382923c22e9f3f70bc94c1ac00b9b963eff7fe4b73ea4921e9e95f66e7d906b105789954a6f2e"
Server: FLUSHING "25245206f1ae877ad17623318d8dbef62665919b78b0af244d2b49bc5e4a33aea58f43c64a06ad7432bda5318d8c819e267d255ec4a44a0b14a638451f784892"
Client: READ "5592418edb300c45f73e050f60f80e05ba6c8b0245bb676426c382923c22e9f3f70bc94c1ac00b9b963eff7fe4b73ea4921e9e95f66e7d906b105789954a6f2e"
Server: FLUSHING "de932b7aa53a85b6a27bb6a0a6ae94b0d94236fa31bb2c572e6aa86ff44b768aa11efa9e4232ba4f21d30b5e37fa2966e8243e7f9e62c4a3e4467ff4e49abe1c"
Server: FLUSHING "39e0b18fa22b299784247159c913d90f587be239d24e6d3c6dae8be1ac437db038e4e94041067f467198826d9b765ba18b71dba1b62b23f29de1b227dcbff87b"
Client: DECOMPRESSED "Lorem ipsum dolor sit amet, conse"
Server: FLUSHING "e38b065252ede3a2ffa5428f3b4d106f181022c652d9c49377a62b06387d53e4c0d43e3a6cf4c500052d4f3d650c1c1c18a84e7e18c403255d256f0aeb9cb709"
Client: READ "25245206f1ae877ad17623318d8dbef62665919b78b0af244d2b49bc5e4a33aea58f43c64a06ad7432bda5318d8c819e267d255ec4a44a0b14a638451f784892"
Client: DECOMPRESSED "ctetuer adipiscing elit. Donec
egestas, enim et consectetuer ullamcorper, lectus ligula rutrum leo, a
elementum elit tortor eu"
Server: FLUSHING "d044afd2607f72fe24459513909fdf480807b346da90f5f2f684f04888d9a41fd05277a1a3074821f2f7fbadcaeed0ff1d73a962ce666e6296b9098f85f8c0e6"
Client: READ "de932b7aa53a85b6a27bb6a0a6ae94b0d94236fa31bb2c572e6aa86ff44b768aa11efa9e4232ba4f21d30b5e37fa2966e8243e7f9e62c4a3e4467ff4e49abe1c"
Server: FLUSHING "dd4c8b46eeda5e45b562d776058dbfe9d1b7e51f6f370ea5"
Client: DECOMPRESSED " quam. Duis tincidunt nisi ut ante. Nulla
facilisi. Sed tristique eros eu libero. Pellentesque vel arcu. Vivamus
p"
Client: READ "39e0b18fa22b299784247159c913d90f587be239d24e6d3c6dae8be1ac437db038e4e94041067f467198826d9b765ba18b71dba1b62b23f29de1b227dcbff87b"
Client: DECOMPRESSED "urus orci, iaculis ac, suscipit sit amet, pulvinar eu,
lacus. Praesent placerat tortor sed nisl. Nunc blandit diam egestas
dui. Pellentesque "
Client: READ "e38b065252ede3a2ffa5428f3b4d106f181022c652d9c49377a62b06387d53e4c0d43e3a6cf4c500052d4f3d650c1c1c18a84e7e18c403255d256f0aeb9cb709"
Client: DECOMPRESSED "habitant morbi tristique senectus et netus et
malesuada fames ac turpis egestas. Aliquam viverra fringilla
leo. Nulla feugiat aug"
Client: READ "d044afd2607f72fe24459513909fdf480807b346da90f5f2f684f04888d9a41fd05277a1a3074821f2f7fbadcaeed0ff1d73a962ce666e6296b9098f85f8c0e6"
Client: DECOMPRESSED "ue eleifend nulla. Vivamus mauris. Vivamus sed
mauris in nibh placerat egestas. Suspendisse potenti. Mauris massa. Ut
eget velit auctor tortor blandit s"
Client: READ "dd4c8b46eeda5e45b562d776058dbfe9d1b7e51f6f370ea5"
Client: DECOMPRESSED "ollicitudin. Suspendisse imperdiet
justo.
"
Client: response matches file contents: True

Mixed Content Streams

The Decompress class returned by decompressobj() can also be used in situations where compressed and uncompressed data is mixed together. After decompressing all of the data, the unused_data attribute contains any data not used.

import zlib

# Build a payload made of the compressed text followed immediately by the
# same text uncompressed.
lorem = open('lorem.txt', 'rt').read()
compressed = zlib.compress(lorem)
combined = compressed + lorem

# Feed the mixed payload to a Decompress object in one call.
decompressor = zlib.decompressobj()
decompressed = decompressor.decompress(combined)

# The decompressed output and the leftover unused_data should both equal
# the original text.
print 'Decompressed matches lorem:', decompressed == lorem
print 'Unused data matches lorem :', decompressor.unused_data == lorem
$ python zlib_mixed.py
Decompressed matches lorem: True
Unused data matches lorem : True

Checksums

In addition to compression and decompression functions, zlib includes two functions for computing checksums of data, adler32() and crc32(). Neither checksum is billed as cryptographically secure, and they are only intended for use for data integrity verification.

Both functions take the same arguments, a string of data and an optional value to be used as a starting point for the checksum. They return a 32-bit signed integer value which can also be passed back on subsequent calls as a new starting point argument to produce a running checksum.

import zlib

data = open('lorem.txt', 'r').read()

# Adler32: first over the data alone, then again using the first result as
# the starting value to produce a running checksum.
cksum = zlib.adler32(data)
print 'Adler32: %12d' % cksum
print ' : %12d' % zlib.adler32(data, cksum)

# CRC-32: same two-step pattern.
cksum = zlib.crc32(data)
print 'CRC-32 : %12d' % cksum
print ' : %12d' % zlib.crc32(data, cksum)
$ python zlib_checksums.py
Adler32: 1865879205
: 118955337
CRC-32 : 1878123957
: -1940264325

The Adler32 algorithm is said to be faster than a standard CRC, but I found it to be slower in some tests.

import timeit

iterations = 1000000

def show_results(title, result, iterations):
"Print results in terms of microseconds per pass and per item."
per_pass = 1000000 * (result / iterations)
print '%s:\t%.2f usec/pass' % (title, per_pass)


# Adler32 computed from scratch on every pass.
adler32 = timeit.Timer(
stmt="zlib.adler32(data)",
setup="import zlib; data=open('lorem.txt','r').read() * 10",
)
show_results('Adler32, separate', adler32.timeit(iterations), iterations)

# Adler32 as a running checksum: each pass starts from the previous result.
adler32_running = timeit.Timer(
stmt="cksum = zlib.adler32(data, cksum)",
setup="import zlib; data=open('lorem.txt','r').read() * 10; cksum = zlib.adler32(data)",
)
show_results('Adler32, running', adler32_running.timeit(iterations), iterations)

# CRC-32 computed from scratch on every pass.
crc32 = timeit.Timer(
stmt="zlib.crc32(data)",
setup="import zlib; data=open('lorem.txt','r').read() * 10",
)
show_results('CRC-32, separate', crc32.timeit(iterations), iterations)

# CRC-32 as a running checksum.
crc32_running = timeit.Timer(
stmt="cksum = zlib.crc32(data, cksum)",
setup="import zlib; data=open('lorem.txt','r').read() * 10; cksum = zlib.crc32(data)",
)
show_results('CRC-32, running', crc32_running.timeit(iterations), iterations)
$ python zlib_checksum_tests.py
Adler32, separate: 30.67 usec/pass
Adler32, running: 30.43 usec/pass
CRC-32, separate: 22.94 usec/pass
CRC-32, running: 23.79 usec/pass

See also

zlib
The standard library documentation for this module.
gzip
The gzip module includes a higher level (file-based) interface to the zlib library.
http://www.zlib.net/
Home page for zlib library.
http://www.zlib.net/manual.html
Complete zlib documentation.
bz2
The bz2 module provides a similar interface to the bzip2 compression library.

PyMOTW Home

Monday, December 22, 2008

Python Magazine for December 2008



The December 2008 issue of Python Magazine is available for download now.

Our cover story this month is Using the Cloud, in which Patrick Altman shows you how to wield the power of globally distributed data centers with infinite storage, immediate scalability, and no fixed costs or large capital outlays using Python.

Michael Schneider brings us Design Patterns in Real Python Projects, a survey of design pattern uses found "in the wild".

JC Cruz continues his series of OS X programming tutorials in Tabular Editing With PyObjC. This time, he covers editing data in a table control.

WikidBASE: Semi-Structured Data Management (in Seconds) by Nick Blundell shows you how to create data management applications without pre-determining a database schema and letting your app grow over time.

Jesse Noller's column, YAML Ain't Markup Language, gives us several good reasons to consider YAML as a substitute for XML, especially in files meant to be read or written by humans.

Mark Mruss shows us how easy it can be to work with 3-D graphics using VPython.

And this month Steve Holden takes a look at various developments in and around the Python language, reports on an extensive discussion on Python's function call mechanism, and reminds you about Python conferences.

Finally, I recount some of my experiences from PyWorks 2008 and look ahead at how the magazine will grow in 2009.

Thursday, December 18, 2008

virtualenvwrapper 1.6.1, even MORE zsh support

Byron Clark contributed a patch to make rmvirtualenv work with zsh, so I packaged up release 1.6.1 of virtualenvwrapper.

What's the deal with zsh, why is it so popular? What am I missing?

Wednesday, December 17, 2008

virtualenvwrapper 1.6 released

The latest release of virtualenvwrapper includes completion support for zsh, courtesy of a patch from Ted Leung. Thanks, Ted!

Saturday, December 13, 2008

Moving PyMOTW to a public repository

I'm investigating options for hosted version control tools to manage the PyMOTW sources. Currently the files are in a private svn repository, but that makes collaboration difficult. This also gives me an excuse to look at DVCS tools to see how they compare.

Contenders:

There are 4 contenders, listed here in no particular order:

  1. Google Code (svn)
  2. Bitbucket (mercurial)
  3. Launchpad (bazaar)
  4. GitHub (git)


I'm not all that keen on git, mostly because my impression is that it
is hard to use. I have no direct experience, so obviously that has to
be taken with at least a grain of salt.

I'm familiar with svn, have a good tool chain built around it, and use
it on my other projects, including Google Code. I don't have a good
sense for the barrier to entry for other users, though. Do people still choose to start projects on svn, or is everyone moving to the newer DVCS-based repositories?

The two other DVCS options seem to have mostly similar feature sets,
although I'm still reading about them so maybe I haven't hit the
differentiating features, yet. I know that the Sphinx project uses
Bitbucket and I've been told the Python core developers may be looking
at bzr (but probably not Launchpad).

Google Code:

I use Google Code for some other projects, and we use svn at work. I'm using that experience as a base-line for evaluating the other options.

Bitbucket / Mercurial:

I was able to install Mercurial with a simple:

$ easy_install Mercurial


I found instructions for enabling the "convert" plugin so I could import my svn repository. My first attempt failed because hg couldn't find the python bindings for svn. @cyberdrow helped me out via Twitter, and after setting my PYTHONPATH="/opt/subversion/lib/svn-python" I was all set.

The conversion took a little while to run, but eventually I had a local Mercurial repository with all of the trunk changes. I wanted the "releases" tree to be used for tags, but I guess that's not a standard naming convention so the tool ignored that part of the tree. No matter, I can work out where to set the tags myself or investigate other options later.

Bitbucket seems like a fairly streamlined code hosting solution. In addition to Mercurial hosting, there's a wiki and an issue tracker. Private repositories < 150 MB are free, so that should cover my needs for this project.

It looks like I'll only have the ability to create a single repository under the free plan, though, so I'll have to consider whether I want to get into the paid hosting plans for other projects. At $5/month, I'd be able to justify subscribing for several years before I matched the cost of the effort it would take for me to set up a secure server with the same features myself.

Launchpad / Bazaar:

Importing into Bazaar was a little more confusing, ironically because of the transparent way they have implemented it. The bzr-svn plugin lets you "branch" directly from a subversion repository to create a local copy managed by bzr. Apparently changes can then be pushed back to the original svn repository, although that's not what I intend to do.

The version of bzr I got when I ran easy_install didn't include the bzr-svn plugin, but after deleting that copy and downloading the OS X installer from the web site I was able to run:

$ bzr branch svn+ssh://server/path/to/svn/repository PyMOTW-bzr


to create a local bzr repository complete with all of the change history. This time it did import the release history so I had "trunk" and "releases" directories in the local copy. Including the "trunk" directory in the branch URL gave me something more closely matching the results from "hg convert".

Launchpad seems to have more features than Bitbucket. One nice feature for code projects is "Blueprints", which can be used to discuss designs and plans before beginning implementation. There doesn't seem to be a general-purpose wiki, but PyMOTW doesn't really need much of one anyway.

The Launchpad UI feels a little less obvious to me. Perhaps that has to do with the fact that most of the views show combined data for all projects, not just my own. That's an interesting approach for open source in general, but not necessarily applicable for this project.

More Research Needed:

That's all the research I've had time to do, so far. I'll be looking at git and GitHub soon because I'd like to try all of the various options before making a decision.

In addition to the basic commands, I need to make sure I understand how to formally branch with each tool. I tend to work on the article for a module by creating a new branch, then merging back into the trunk when I'm done. I'm paranoid about backing up work-in-progress, so I really want a way to commit to a branch hosted somewhere other than my laptop. In svn, I use the usual "svn copy" commands to create a new working tree within the repository. Mercurial has a notion of branches, but it works differently (perhaps more like the old CVS branches?) so I need to study it more. Bazaar seems to create an entire copy of the repository when a branch is made, rather than storing the branch in a way that lets me save copies of data in the central location. It isn't clear yet if I can do something to create the branch on the shared repository.

Sunday, December 7, 2008

PyMOTW: gzip


gzip – Read and write gzip files

Purpose: Read and write gzip files.
Python Version: 1.5.2 and later

The gzip module provides a file-like interface to GNU zip files, using zlib to compress and uncompress the data.

Writing Compressed Files

The module-level function open() creates an instance of the file-like class GzipFile. The usual methods for writing and reading data are provided. To write data into a compressed file, open the file with mode 'wb'.

import gzip
import os

# Write a single line through the gzip file-like object.
output = gzip.open('example.txt.gz', 'wb')
try:
    output.write('Contents of the example file go here.\n')
finally:
    # Close even if the write fails so the file handle is released.
    output.close()

# Show what ended up on disk.
os.system('ls -l example.txt.gz')
os.system('file example.txt.gz')
$ python gzip_write.py
-rw-r--r-- 1 dhellmann dhellmann 68 Dec 7 10:44 example.txt.gz
example.txt.gz: gzip compressed data, was "example.txt", last modified: Sun Dec 7 10:44:42 2008, max compression

Different compression levels can be used by passing a compresslevel argument. Valid values range from 1 to 9, inclusive. Lower values are faster and result in less compression. Higher values are slower and compress more, up to a point.

import gzip
import os

data = open('lorem.txt', 'r').read() * 1024
print 'Input contains %d bytes' % len(data)

for i in xrange(1, 10):
filename = 'compress-level-%s.gz' % i
output = gzip.open(filename, 'wb', compresslevel=i)
try:
output.write(data)
finally:
output.close()
os.system('cksum %s' % filename)

The center column of numbers in the output of the script is the size in bytes of the files produced. As you see, for this input data, the higher compression values do not necessarily pay off in decreased storage space. Results will vary, depending on the input data.

$ python gzip_compresslevel.py
Input contains 754688 bytes
999397133 9839 compress-level-1.gz
2612203818 8260 compress-level-2.gz
3676863750 8221 compress-level-3.gz
4292954809 4160 compress-level-4.gz
3686111199 4160 compress-level-5.gz
3075010677 4160 compress-level-6.gz
2468097299 4160 compress-level-7.gz
1221970342 4160 compress-level-8.gz
1820398784 4160 compress-level-9.gz

A GzipFile instance also includes a writelines() method that can be used to write a sequence of strings.

import gzip
import itertools
import os

output = gzip.open('example_lines.txt.gz', 'wb')
try:
    # writelines() accepts any sequence of strings; repeat() supplies the
    # same line ten times.
    output.writelines(itertools.repeat('The same line, over and over.\n', 10))
finally:
    output.close()

# Decompress to stdout to show what was written.
os.system('gzcat example_lines.txt.gz')
$ python gzip_writelines.py
The same line, over and over.
The same line, over and over.
The same line, over and over.
The same line, over and over.
The same line, over and over.
The same line, over and over.
The same line, over and over.
The same line, over and over.
The same line, over and over.
The same line, over and over.


Reading Compressed Data

To read data back from previously compressed files, simply open the file with mode 'rb'.

import gzip

input_file = gzip.open('example.txt.gz', 'rb')
try:
print input_file.read()
finally:
input_file.close()

This example reads the file written by gzip_write.py from the previous section.

$ python gzip_read.py
Contents of the example file go here.

While reading a file, it is also possible to seek and read only part of the data.

import gzip

input_file = gzip.open('example.txt.gz', 'rb')
try:
print 'Entire file:'
all_data = input_file.read()
print all_data

expected = all_data[5:15]

# rewind to beginning
input_file.seek(0)

# move ahead 5 bytes
input_file.seek(5)
print 'Starting at position 5 for 10 bytes:'
partial = input_file.read(10)
print partial

print
print expected == partial
finally:
input_file.close()

The seek() position is relative to the uncompressed data, so the caller does not even need to know that the data file is compressed.

$ python gzip_seek.py
Entire file:
Contents of the example file go here.

Starting at position 5 for 10 bytes:
nts of the

True

Working with Streams

It is possible to use the GzipFile class directly to compress or uncompress a data stream, instead of an entire file. This is useful for working with data being transmitted over a socket or from an existing (open) file handle. A StringIO buffer can also be used.

import gzip
from cStringIO import StringIO
import binascii

uncompressed_data = 'The same line, over and over.\n' * 10
print 'UNCOMPRESSED:', len(uncompressed_data)
print uncompressed_data

buf = StringIO()
f = gzip.GzipFile(mode='wb', fileobj=buf)
f.write(uncompressed_data)
f.flush()

compressed_data = buf.getvalue()
print 'COMPRESSED:', len(compressed_data)
print binascii.hexlify(compressed_data)

inbuffer = StringIO(compressed_data)
f = gzip.GzipFile(mode='rb', fileobj=inbuffer)
reread_data = f.read()#len(uncompressed_data))
f.close()

print
print 'RE-READ:', len(reread_data)
print reread_data

Note

When re-reading the previously compressed data, I pass an explicit length to
read(). Leaving the length off resulted in a CRC error, possibly because
StringIO returned an empty string before reporting EOF. If you are
working with streams of compressed data, you may want to prefix the data with
an integer representing the actual amount of data to be read.

$ python gzip_StringIO.py
UNCOMPRESSED: 300
The same line, over and over.
The same line, over and over.
The same line, over and over.
The same line, over and over.
The same line, over and over.
The same line, over and over.
The same line, over and over.
The same line, over and over.
The same line, over and over.
The same line, over and over.

COMPRESSED: 48
1f8b080097fc3b4902ff0ac94855284ecc4d55c8c9cc4bd551c82f4b2d5248cc4b0133f4b8424665916401000000ffff

RE-READ: 300
The same line, over and over.
The same line, over and over.
The same line, over and over.
The same line, over and over.
The same line, over and over.
The same line, over and over.
The same line, over and over.
The same line, over and over.
The same line, over and over.
The same line, over and over.

See also

gzip
The standard library documentation for this module.
zlib
The zlib module is a lower-level interface to gzip compression.
zipfile
The zipfile module gives access to ZIP archives.
bz2
The bz2 module uses the bzip2 compression format.
tarfile
The tarfile module includes built-in support for reading compressed tar archives.

PyMOTW Home


Updated 8 Dec to avoid using built-in names for local variables in some of the examples.

Saturday, December 6, 2008

Rewriting Matt Wilson's ugly code

Matt Wilson posted asking for help cleaning up some date manipulation code he had written. Here's my attempt:

import simplejson
from datetime import date, datetime, timedelta
import calendar

def get_dict_or_cookie_value(key, d, s):
    """
    Return the value for key from dictionary d, falling back to cookie s.

    A truthy value in d always wins. Otherwise, if the cookie has a
    non-empty value stored under key, that value is decoded from JSON
    and returned. When neither source has a usable value, whatever
    d.get(key) produced (normally None) is returned.
    """
    value = d.get(key)
    # "key in s" replaces the deprecated has_key() call.
    if not value and key in s and s[key].value:
        value = simplejson.loads(s[key].value)
    return value

def get_start_and_stop_dates(d, s):
    """
    Returns a tuple of datetime.date objects.

    First checks dictionary d, then looks in the cookie s, then returns
    the first and last day of the month.

    We return values from the dictionary d, even if the values exist in
    simple_cookie s:

    >>> d = {'start_date':'12-07-2008', 'stop_date':'12-20-2008'}
    >>> import Cookie, simplejson
    >>> s = Cookie.SimpleCookie()
    >>> s['start_date'] = simplejson.dumps('12-08-2008')
    >>> s['stop_date'] = simplejson.dumps('12-11-2008')
    >>> a, b = get_start_and_stop_dates(d, s)
    >>> from datetime import date
    >>> isinstance(a, date) and isinstance(b, date)
    True
    >>> a.strftime('%m-%d-%Y'), b.strftime('%m-%d-%Y')
    ('12-07-2008', '12-20-2008')

    If the dictionary d doesn't have values, then we get them from the
    simple_cookie object s:

    >>> a, b = get_start_and_stop_dates({}, s)
    >>> from datetime import date
    >>> isinstance(a, date) and isinstance(b, date)
    True
    >>> a.strftime('%m-%d-%Y'), b.strftime('%m-%d-%Y')
    ('12-08-2008', '12-11-2008')

    We handle mix-and-match scenarios, like where one value is in d and
    another is in s:

    >>> s2 = Cookie.SimpleCookie()
    >>> s2['stop_date'] = simplejson.dumps('2-28-1975')
    >>> get_start_and_stop_dates({'start_date':'2-17-1975'}, s2)
    (datetime.date(1975, 2, 17), datetime.date(1975, 2, 28))

    When just one of the dates is specified, then the other will be
    the first/last day of the month containing the other date:

    >>> get_start_and_stop_dates({'start_date':'2-17-1975'},
    ...                          Cookie.SimpleCookie())
    (datetime.date(1975, 2, 17), datetime.date(1975, 2, 28))

    >>> get_start_and_stop_dates({'stop_date':'2-17-1975'},
    ...                          Cookie.SimpleCookie())
    (datetime.date(1975, 2, 1), datetime.date(1975, 2, 17))

    Finally, we use the first and last days of the current month.
    """
    # These are the dateformats that the dates will be in.
    dateformats = ['%m-%d-%Y', '%Y-%m-%d', '%Y-%m-%d %H:%M:%S']

    # Same lookup for both ends of the range; start is resolved first.
    start_date = _find_date('start_date', d, s, dateformats)
    stop_date = _find_date('stop_date', d, s, dateformats)

    # Remember, if we found one date, but not the other, then we return
    # the first/last date of that month, not the current month.
    if not start_date or not stop_date:
        month_source = start_date or stop_date or datetime.now()
        year, month = month_source.year, month_source.month

        if not start_date:
            # first day of the month
            start_date = date(year, month, 1)

        if not stop_date:
            # last day of the month
            stop_date = date(year, month, calendar.monthrange(year, month)[1])

    return (start_date, stop_date)


def _find_date(key, d, s, dateformats):
    """Return the date stored under key in d or cookie s, or None if absent."""
    source = get_dict_or_cookie_value(key, d, s)
    if not source:
        return None
    return stubborn_datetimeparser(source, dateformats).date()


def stubborn_datetimeparser(s, dateformats):
    """
    Keep trying to parse s into a datetime object until we succeed or
    run out of dateformats.

    When the first format works, we immediately return:

    >>> dateformats = ['%Y-%m-%d', '%m-%d-%Y', '%m-%d-%Y %H:%M']
    >>> stubborn_datetimeparser('2008-12-1', dateformats)
    datetime.datetime(2008, 12, 1, 0, 0)

    Otherwise, we keep trying until we parse it:

    >>> stubborn_datetimeparser('12-1-2008', dateformats)
    datetime.datetime(2008, 12, 1, 0, 0)

    >>> stubborn_datetimeparser('12-1-2008 15:47', dateformats)
    datetime.datetime(2008, 12, 1, 15, 47)

    or we run out of formats, and raise a ValueError:

    >>> stubborn_datetimeparser('12/1/2008', dateformats)
    Traceback (most recent call last):
    ...
    ValueError: I couldn't parse '12/1/2008' with any of my formats!
    """
    for datefmt in dateformats:
        try:
            return datetime.strptime(s, datefmt)
        except ValueError:
            # This format did not match; move on to the next one.
            continue

    # Every format was tried (or none were supplied) without success.
    raise ValueError("I couldn't parse '%s' with any of my formats!" % s)


[Updated with all of the code. D'oh!]