Re: TAR reading/writing module...

Steven D. Majewski (sdm7g@elvis.med.virginia.edu)
Tue, 8 Feb 1994 00:54:23 -0500

On Feb 7, 15:15, Lance Ellinghouse wrote:
> Subject: TAR reading/writing module...
>
> I remember someone posting a module to read and extract files
> from a TAR archive... He had mentioned he was going to be working on adding
> support for writing a TAR file also...
>
> Does anyone know where I can get this? I would like to be able to
> create TAR files from inside a PYTHON program without using 'tar'.
>
> Basically I need to insert a file into a TAR archive using the
> user, group, and mode that I choose, no matter what the actual
> user, group, and mode are on the file...
>

Here is what I have so far, Lance.
There are two classes defined:
Class TarHdr defines the tar file header.
TarFile is a sequence of headers and files.
The procedure at the end of the module lists the contents of a tarfile.
It is read-only so far - I haven't implemented any methods to write
out a tarfile.

Sorry it's not documented. This was the first pass in trying to
figure out if I understood how to interpret the header. I was
researching some issues relating to adding extensions to support
MacBinary and some other non-unix file types before Christmas,
and since I've been back, besides being out with the flu, and a
brief break to build and install Python 1.0 ( now with readline,
but still without X or dld ) I have been working on other projects
( statistics and math programming )

There are some other holes which should be apparent from looking
at the various filetypes defined in the header code that I *don't*
yet make any attempt to handle - long file names and sparse files
for example. ( There were also vague plans afoot to bury implementation
level differences and caching strategies when reading small disk-resident
tarfiles versus enormous ( hundreds of MBytes ) tape resident files,
or files piped to stdin. )

I hope to get back to it soon, but if you can't wait, you're
welcome to take a hack at it yourself. ( just post or send me
a copy of your new improved version! )

One warning:

I did change the way defaults were handled a couple of times as
I tried it out on various "standard" files produced by tar,
gnutar and pax. ( for example, at least one of those programs,
but not all, can produce a null filename in some instances, so
the header parse must produce a null filename, and not NO filename
attribute, if you catch the diff. ). So if you mess with that
part, do try to test it on some different files to verify that
it doesn't crash and burn on a name or attribute error.

The primary applications planned:
A remote ftp to tape backup program.
A tar file browser ( with 'ls', 'cd', 'file' and 'more' commands. )
A tar file filter - to be inserted between tarfile and tar command
to change filenames, owners, or other parameters when de-taring.

- Steve Majewski (804-982-0831) <sdm7g@Virginia.EDU>
- UVA Department of Molecular Physiology and Biological Physics

#!/usr/local/bin/python
#
#
# Most of these 'defines' ( and their comments ) have been
# converted from gnutar's tar.h include file.
#
# - S.D.Majewski <sdm7g@Virginia.EDU>
#
_NULL = '\000'
_NAMSIZ = 100
_USIZE = 32
_RECSIZE = 512

_LF_OLDNORMAL = '\0' ## Normal disk file, Unix compat
_LF_NORMAL = '0' ## Normal disk file
_LF_LINK = '1' ## Link to previously dumped file
_LF_SYMLINK = '2' ## Symbolic link
_LF_CHR = '3' ## Character special file
_LF_BLK = '4' ## Block special file
_LF_DIR = '5' ## Directory
_LF_FIFO = '6' ## FIFO special file
_LF_CONTIG = '7' ## Contiguous file

## Further link types may be defined later.
##
## Note that the standards committee allows only capital A through
## capital Z for user-defined expansion. This means that defining something
## as, say '8' is a *bad* idea.

_LF_DUMPDIR = 'D' ## This is a dir entry that contains
## the names of files that were in the
## dir at the time the dump was made
_LF_LONGLINK = 'K' ## Identifies the NEXT file on the tape
## as having a long linkname
_LF_LONGNAME = 'L' ## Identifies the NEXT file on the tape
## as having a long name.
_LF_MULTIVOL = 'M' ## This is the continuation
## of a file that began on another
## volume
_LF_NAMES = 'N' ## For storing filenames that didn't
## fit in 100 characters
_LF_SPARSE = 'S' ## This is for sparse files
_LF_VOLHDR = 'V' ## This file is a tape/volume header

# Maps each linkflag value to a text description.  The first character of
# the description is what TarHdr.modestr() puts in front of the permission
# string in verbose directory listings.
entry_types = {
    _LF_OLDNORMAL: ' File',
    _LF_NORMAL: '- normal file',
    _LF_LINK: '- link',
    _LF_SYMLINK: 'l symbolic link',
    _LF_CHR: 'character special file',
    _LF_BLK: 'block special file',
    _LF_DIR: 'directory',
    _LF_FIFO: 'pipe/FIFO special file',
    _LF_CONTIG: 'contiguous file',
    _LF_DUMPDIR: 'dumpdir listing',
    _LF_LONGLINK: '?Next file on tape has a long linkname',
    _LF_LONGNAME: '?Next file on the tape has a longname',
    _LF_MULTIVOL: '+Continuation of tape volume',
    _LF_NAMES: '?Long filename',
    _LF_SPARSE: '- Sparse File',
    _LF_VOLHDR: 'Volume/Tape Header',
}

import time
from string import ljust,rjust,strip

# Header layout: one (attribute name, field width, kind) entry per field,
# in the order TarHdr.parse() walks the header.  Kind 'o' marks a field
# that parse() converts from octal text to a number; 's' marks a string.
_map = (
    ('name', _NAMSIZ, 's'),
    ('mode', 8, 'o'),
    ('uid', 8, 'o'),
    ('gid', 8, 'o'),
    ('size', 12, 'o'),
    ('mtime', 12, 'o'),
    ('chksum', 8, 'o'),
    ('linkflag', 1, 's'),
    ('linkname', _NAMSIZ, 's'),
    ('magic', 8, 's'),
    ('uname', 32, 's'),
    ('gname', 32, 's'),
    ('devmaj', 8, 'o'),
    ('devmin', 8, 'o'),
    ('atime', 12, 'o'),
    ('ctime', 12, 'o'),
    ('offset', 12, 'o'),
)

def _zerostrip(s):
    """Return a copy of *s* with every NUL character removed.

    All NULs are dropped, not only trailing padding, so a string made up
    of nothing but NULs collapses to the empty string.  TarHdr.parse()
    relies on that to recognise an empty record.
    """
    # One library call replaces the character-by-character concatenation
    # (quadratic) and the '<>' comparison, which Python 3 no longer accepts.
    return s.replace(_NULL, '')

# This class defines a tar file header
class TarHdr:
    """One tar header record, decoded into attributes.

    After parse() every field named in the module-level _map table is
    available as an attribute (name, mode, uid, gid, size, mtime, ...).
    Numeric fields are stored as integers, the rest as strings.
    """

    # Description used for a linkflag value that entry_types does not list.
    # Its first character is what modestr() shows for such an entry.
    _unknown_type = '?unknown entry type'

    def __init__(self, *hdrstr):
        """Create a header; if an argument is given, parse it right away."""
        if hdrstr:
            self.parse(hdrstr[0])

    def parse(self, h):
        """Decode the raw header record *h* into attributes of this object.

        *h* may be a text string or a bytes object; bytes are decoded as
        latin-1 so that every byte maps to exactly one character.

        Raises EOFError if *h* is empty or holds nothing but NULs, and
        ValueError if a numeric field is not valid octal text.
        """
        if not h:
            raise EOFError('Null Header')
        if not isinstance(h, str):
            # Binary-mode reads on Python 3 hand us bytes.
            h = h.decode('latin-1')
        if not _zerostrip(h):
            raise EOFError('Null Header')

        start = end = 0
        for field, width, kind in _map:
            start, end = end, end + width
            raw = h[start:end]
            if kind == 'o':
                digits = _zerostrip(raw).strip()
                # SECURITY: this text comes straight from the archive.  It
                # used to be passed to eval(), which would run whatever
                # expression the archive contained; int(..., 8) accepts
                # octal digits only.  A blank field counts as 0, as before.
                if digits:
                    try:
                        val = int(digits, 8)
                    except ValueError:
                        raise ValueError(
                            'bad octal value %r in tar header field %r'
                            % (digits, field))
                else:
                    val = 0
            elif width == 1:
                # Single-character field (linkflag): keep it untouched so
                # that a NUL type flag still matches _LF_OLDNORMAL.
                val = raw
            else:
                val = _zerostrip(raw)
            setattr(self, field, val)

    def filetype(self):
        """Return the text description of this entry's linkflag."""
        # .get() keeps a type flag missing from entry_types from raising
        # KeyError in the middle of a listing.
        return entry_types.get(self.linkflag, self._unknown_type)

    def _modestr(self):
        """Return the low nine mode bits as an 'rwxrwxrwx'-style string."""
        flags = []
        for pos, letter in enumerate('rwxrwxrwx'):
            # The leftmost letter corresponds to bit 8, the rightmost to bit 0.
            if self.mode & (1 << (8 - pos)):
                flags.append(letter)
            else:
                flags.append('-')
        return ''.join(flags)

    def modestr(self):
        """Return the type character followed by the permission string."""
        return self.filetype()[0] + self._modestr()

    def user(self):
        """Return the owner name, or the numeric uid when no name is set."""
        if hasattr(self, 'uname') and self.uname:
            return self.uname
        return repr(self.uid)

    def group(self):
        """Return the group name, or the numeric gid when no name is set."""
        if hasattr(self, 'gname') and self.gname:
            return self.gname
        return repr(self.gid)

    def skip(self):
        """Return self.size rounded up to a whole number of records."""
        # divmod keeps the arithmetic in integers on every Python version.
        blocks, extra = divmod(self.size, _RECSIZE)
        if extra:
            blocks = blocks + 1
        return blocks * _RECSIZE

    def ugfmt(self, w):
        """Return 'user/group' with each half padded to (w - 1) // 2 columns.

        The user is right-justified and the group left-justified, so the
        slash sits in the middle of the field.
        """
        half = (w - 1) // 2  # floor division: the width must stay an integer
        return self.user().rjust(half) + '/' + self.group().ljust(half)

    def string(self):
        """Return the one-line verbose listing for this entry."""
        stamp = time.ctime(self.mtime)
        # Keep characters 4-10 of the ctime() text plus its last two.
        stamp = stamp[4:11] + stamp[-2:]
        owner = ' [' + self.ugfmt(13) + ']'
        return (self.modestr() + ' ' + self.name.ljust(35) + stamp + owner
                + repr(self.size).rjust(6))

    def __repr__(self):
        return self.string()
#
# class TarFile is just a sequence of tar headers + the file
# contents and/or references to the (non-memory-resident) file contents.
#
class TarFile:
    """A sequence of TarHdr objects read from one tar archive.

    Supports len() and indexing.  Each header in the sequence carries two
    extra attributes set by readfrom(): 'where' and 'file'.
    """

    def __init__(self, *args):
        """Create an empty sequence; if a file object is given, read it."""
        self.list = []
        # Set by readfrom().  Initialised here so that repr() also works on
        # a TarFile that has not read anything yet.
        self.file = None
        if args:
            self.readfrom(args[0])

    def readfrom(self, file):
        """Append every entry found in the open file object *file*.

        Reading stops at end of file or at the first record for which
        TarHdr raises EOFError.  Returns self.

        For each entry, 'where' is the value of file.tell() right after the
        header was read (None if tell() raises IOError) and 'file' holds
        the entry's data, read completely into memory.
        """
        self.file = file
        while True:
            record = file.read(_RECSIZE)
            if not record:
                return self
            try:
                hdr = TarHdr(record)
            except EOFError:
                return self
            try:
                hdr.where = file.tell()
            except IOError:
                # e.g. a pipe: the position is not available
                hdr.where = None
            # Empty value of the same type that file.read() returns.
            hdr.file = record[:0]
            if hdr.size:
                hdr.file = file.read(hdr.size)
            # Data is padded to a whole number of records; discard the rest.
            excess = hdr.skip() - hdr.size
            if excess:
                file.read(excess)
            self.list.append(hdr)

    def __len__(self):
        return len(self.list)

    def __getitem__(self, i):
        return self.list[i]

    def dir(self):
        """Print this object's repr, then one verbose line per entry."""
        print(repr(self))
        for hdr in self.list:
            print(hdr.string())

    def ls(self):
        """Print the name of every entry, one per line."""
        for hdr in self.list:
            print(hdr.name)

    def __repr__(self):
        if self.file is None:
            source = 'no file'
        else:
            # Drop the first and last character of the file object's repr
            # (its angle brackets, for a regular file object).
            source = repr(self.file)[1:-1]
        return '< TarFile: ' + source + ' >'

def list(file):
    """Print a verbose listing of a tar archive.

    *file* is either a path name, which is opened here, or an already open
    file object, which is used as is and left open.

    Note: this function deliberately keeps its historical name even though
    it hides the builtin 'list' inside this module.
    """
    opened_here = isinstance(file, str)
    if opened_here:
        # A tar archive is binary data; text mode can alter it on platforms
        # that translate line endings.
        file = open(file, 'rb')
    try:
        TarFile(file).dir()
    finally:
        # Only close what this function opened itself.
        if opened_here:
            file.close()


import sys
# Script entry point: list every archive named on the command line.
# The __name__ guard keeps a plain 'import' of this module from acting on
# the importing program's own command-line arguments.
if __name__ == '__main__':
    for afile in sys.argv[1:]:
        if afile == '-':
            # '-' means standard input.  Use the underlying binary stream
            # where the interpreter provides one (sys.stdin.buffer).
            list(getattr(sys.stdin, 'buffer', sys.stdin))
        else:
            list(afile)