Efficiently reading data from a tgz archive

If I am about to implement something, it usually turns out to be a good idea to start with some prototypes and benchmarks first. This post summarizes the outcome of such an experiment: reading data from compressed tar archives efficiently. I ran similar benchmarks about 30 months ago, so I repeated the measurements to see whether there have been improvements, and I added results for my new Go reader.

DISCLAIMER: I am not doing this to pick on nodejs. I repeated the measurements in the hope that the performance issues had been fixed by now.

Results are the average times (user + sys) measured while reading a 277 kB file 1000 times.

Language	Time (lower is better)
Go:	4.808 ms
Python:	54.272 ms
Node.js:	324.432 ms

The performance of the Python implementation is already impressive, but the result of implementing this in Go is amazing. Wow, an advantage of more than 10x over Python!

Source code of the Node.js implementation:

var fs = require('fs');
var tar = require('tar');
var zlib = require('zlib');
var Stream = require('stream');

// Stream the gzipped tarball through the decompressor and the tar parser,
// and print the size of the single log file we are interested in.
fs.createReadStream('supercars-logs-13060317.tgz')
    // createUnzip() is the documented factory; calling the zlib.Unzip
    // constructor without `new` is deprecated in current Node.js releases.
    .pipe(zlib.createUnzip())
    .pipe(tar.Parse())
    .on('entry', function(entry) {
        // Strict comparison: no type coercion when matching the member name.
        if (entry.path === 'responsetimes.log.13060317') {
            var bytes = 0;
            // Add up the chunk sizes as the entry's data streams past.
            entry.on('data', function(data) {
                bytes += data.length;
            });
            // Report the total once the entry has been read to its end.
            entry.on('end', function() {
                console.log('bytes: ' + bytes);
            });
        }
    });

Source code of the Python implementation:

#!/usr/bin/env python

import sys
import tarfile
import timeit

def count():
    """Print the size in bytes of one log file stored in the tgz archive.

    Opens 'supercars-logs-13060317.tgz', reads the member
    'responsetimes.log.13060317' line by line and prints the summed length
    of all lines as "bytes: <n>".
    """
    total = 0  # named `total` so the built-in `bytes` type is not shadowed

    # The context manager closes the archive even if reading fails.
    with tarfile.open('supercars-logs-13060317.tgz', 'r:gz') as tar:
        f = tar.extractfile('responsetimes.log.13060317')

        for line in f:
            total += len(line)

    # The parenthesised form prints the same text on Python 2 and Python 3.
    print("bytes: %d" % total)

# Run the byte count once when the script is executed.
count()

Source code of the Go implementation:

package main

import (
    "archive/tar"
    "compress/gzip"
    "fmt"
    "os"
    "bufio"
)

// main prints the number of bytes that countBytes reports for the log file
// responsetimes.log.13060317 inside supercars-logs-13060317.tgz.
func main() {
    const (
        archive  = "supercars-logs-13060317.tgz"
        filename = "responsetimes.log.13060317"
    )

    fmt.Printf("bytes: %d\n", countBytes(archive, filename))
}

func countBytes(archive string, filename string) int {
    f, err := os.Open(archive)
    if err != nil {
        fmt.Println(err)
        os.Exit(1)
    }
    defer f.Close()

    gzf, err := gzip.NewReader(f)
    if err != nil {
        fmt.Println(err)
        os.Exit(1)
    }

    tarReader := tar.NewReader(gzf)

    // search filename in archive
    for {
        header, err := tarReader.Next()
        if err != nil {
            fmt.Println(err)
            os.Exit(1)
        }

        if header.Typeflag == tar.TypeReg && header.Name == filename {
            count := 0
            scanner := bufio.NewScanner(tarReader)
            scanner.Split(bufio.ScanBytes)

            for scanner.Scan() { count++ }
            return count
        }
    }
}

Best, Mark

Resources

http://stackoverflow.com/questions/17152307/is-something-wrong-with-how-i-read-tgz-files-in-node-js-benchmark-says-it-is-sl