# -*- coding: windows-1252 -*- import sys import struct class Reader: def __init__(self, filename, dump = False): self.dump = dump self.STREAMS = {} doc = file(filename, 'rb').read() self.header, self.data = doc[0:512], doc[512:] del doc self.__build_header() self.__build_MSAT() self.__build_SAT() self.__build_directory() self.__build_short_sectors_data() if len(self.short_sectors_data) > 0: self.__build_SSAT() else: if self.dump and (self.total_ssat_sectors != 0 or self.ssat_start_sid != -2): print 'NOTE: header says that must be', self.total_ssat_sectors, 'short sectors' print 'NOTE: starting at', self.ssat_start_sid, 'sector' print 'NOTE: but file does not contains data in short sectors' self.ssat_start_sid = -2 self.total_ssat_sectors = 0 self.SSAT = [-2] for dentry in self.dir_entry_list[1:]: (did, sz, name, t, c, did_left, did_right, did_root, dentry_start_sid, stream_size ) = dentry stream_data = '' if stream_size > 0: if stream_size >= self.min_stream_size: args = (self.data, self.SAT, dentry_start_sid, self.sect_size) else: args = (self.short_sectors_data, self.SSAT, dentry_start_sid, self.short_sect_size) stream_data = self.get_stream_data(*args) if name != '': # BAD IDEA: names may be equal. NEED use full paths... self.STREAMS[name] = stream_data def __build_header(self): self.doc_magic = self.header[0:8] if self.doc_magic != '\xD0\xCF\x11\xE0\xA1\xB1\x1A\xE1': raise Exception, 'Not an OLE file.' self.file_uid = self.header[8:24] self.rev_num = self.header[24:26] self.ver_num = self.header[26:28] self.byte_order = self.header[28:30] self.log2_sect_size, = struct.unpack(' 0: msat_sector = struct.unpack('<128l', self.data[next*self.sect_size:(next+1)*self.sect_size]) self.MSAT.extend(msat_sector[:127]) next = msat_sector[-1] if self.dump: print 'MSAT (header part): \n', self.MSAT[:109] print 'additional MSAT sectors: \n', self.MSAT[109:] def __build_SAT(self): sat_stream = ''.join([self.data[i*self.sect_size:(i+1)*self.sect_size] for i in self.MSAT if i >= 0]) sat_sids_count = len(sat_stream) >> 2 self.SAT = struct.unpack('<%dl' % sat_sids_count, sat_stream) # SIDs tuple if self.dump: print 'SAT sid count:\n', sat_sids_count print 'SAT content:\n', self.SAT def __build_SSAT(self): ssat_stream = self.get_stream_data(self.data, self.SAT, self.ssat_start_sid, self.sect_size) ssids_count = len(ssat_stream) >> 2 self.SSAT = struct.unpack('<%dl' % ssids_count, ssat_stream) if self.dump: print 'SSID count:', ssids_count print 'SSAT content:\n', self.SSAT def __build_directory(self): dir_stream = self.get_stream_data(self.data, self.SAT, self.dir_start_sid, self.sect_size) self.dir_entry_list = [] i = 0 while i < len(dir_stream): dentry = dir_stream[i:i+128] # 128 -- dir entry size i += 128 did = len(self.dir_entry_list) sz, = struct.unpack(' 0 : name = dentry[0:sz-2].decode('utf_16_le', 'replace') else: name = u'' t, = struct.unpack('B', dentry[66]) c, = struct.unpack('B', dentry[67]) did_left , = struct.unpack('= self.min_stream_size: print 'stream stored as normal stream' else: print 'stream stored as short-stream' def __build_short_sectors_data(self): (did, sz, name, t, c, did_left, did_right, did_root, dentry_start_sid, stream_size) = self.dir_entry_list[0] assert t == 0x05 # Short-Stream Container Stream (SSCS) resides in Root Storage if stream_size == 0: self.short_sectors_data = '' else: self.short_sectors_data = self.get_stream_data(self.data, self.SAT, dentry_start_sid, self.sect_size) def get_stream_data(self, data, SAT, start_sid, sect_size): sid = start_sid chunks = [(sid, sid)] stream_data = '' while SAT[sid] >= 0: next_in_chain = SAT[sid] last_chunk_start, last_chunk_finish = chunks[-1] if next_in_chain == last_chunk_finish + 1: chunks[-1] = last_chunk_start, next_in_chain else: chunks.extend([(next_in_chain, next_in_chain)]) sid = next_in_chain for s, f in chunks: stream_data += data[s*sect_size:(f+1)*sect_size] #print chunks return stream_data def print_bin_data(data): i = 0 while i < len(data): j = 0 while (i < len(data)) and (j < 16): c = '0x%02X' % ord(data[i]) sys.stdout.write(c) sys.stdout.write(' ') i += 1 j += 1 print if i == 0: print '' # This implementation writes only 'Root Entry', 'Workbook' streams # and 2 empty streams for aligning directory stream on sector boundary # # LAYOUT: # 0 header # 76 MSAT (1st part: 109 SID) # 512 workbook stream # ... additional MSAT sectors if streams' size > about 7 Mb == (109*512 * 128) # ... SAT # ... directory stream # # NOTE: this layout is "ad hoc". It can be more general. RTFM class XlsDoc: SECTOR_SIZE = 0x0200 MIN_LIMIT = 0x1000 SID_FREE_SECTOR = -1 SID_END_OF_CHAIN = -2 SID_USED_BY_SAT = -3 SID_USED_BY_MSAT = -4 def __init__(self): #self.book_stream = '' # padded self.book_stream_sect = [] self.dir_stream = '' self.dir_stream_sect = [] self.packed_SAT = '' self.SAT_sect = [] self.packed_MSAT_1st = '' self.packed_MSAT_2nd = '' self.MSAT_sect_2nd = [] self.header = '' def __build_directory(self): # align on sector boundary self.dir_stream = '' dentry_name = '\x00'.join('Root Entry\x00') + '\x00' dentry_name_sz = len(dentry_name) dentry_name_pad = '\x00'*(64 - dentry_name_sz) dentry_type = 0x05 # root storage dentry_colour = 0x01 # black dentry_did_left = -1 dentry_did_right = -1 dentry_did_root = 1 dentry_start_sid = -2 dentry_stream_sz = 0 self.dir_stream += struct.pack('<64s H 2B 3l 9L l L L', dentry_name + dentry_name_pad, dentry_name_sz, dentry_type, dentry_colour, dentry_did_left, dentry_did_right, dentry_did_root, 0, 0, 0, 0, 0, 0, 0, 0, 0, dentry_start_sid, dentry_stream_sz, 0 ) dentry_name = '\x00'.join('Workbook\x00') + '\x00' dentry_name_sz = len(dentry_name) dentry_name_pad = '\x00'*(64 - dentry_name_sz) dentry_type = 0x02 # user stream dentry_colour = 0x01 # black dentry_did_left = -1 dentry_did_right = -1 dentry_did_root = -1 dentry_start_sid = 0 dentry_stream_sz = self.book_stream_len self.dir_stream += struct.pack('<64s H 2B 3l 9L l L L', dentry_name + dentry_name_pad, dentry_name_sz, dentry_type, dentry_colour, dentry_did_left, dentry_did_right, dentry_did_root, 0, 0, 0, 0, 0, 0, 0, 0, 0, dentry_start_sid, dentry_stream_sz, 0 ) # padding dentry_name = '' dentry_name_sz = len(dentry_name) dentry_name_pad = '\x00'*(64 - dentry_name_sz) dentry_type = 0x00 # empty dentry_colour = 0x01 # black dentry_did_left = -1 dentry_did_right = -1 dentry_did_root = -1 dentry_start_sid = -2 dentry_stream_sz = 0 self.dir_stream += struct.pack('<64s H 2B 3l 9L l L L', dentry_name + dentry_name_pad, dentry_name_sz, dentry_type, dentry_colour, dentry_did_left, dentry_did_right, dentry_did_root, 0, 0, 0, 0, 0, 0, 0, 0, 0, dentry_start_sid, dentry_stream_sz, 0 ) * 2 def __build_sat(self): # Build SAT book_sect_count = self.book_stream_len >> 9 dir_sect_count = len(self.dir_stream) >> 9 total_sect_count = book_sect_count + dir_sect_count SAT_sect_count = 0 MSAT_sect_count = 0 SAT_sect_count_limit = 109 while total_sect_count > 128*SAT_sect_count or SAT_sect_count > SAT_sect_count_limit: SAT_sect_count += 1 total_sect_count += 1 if SAT_sect_count > SAT_sect_count_limit: MSAT_sect_count += 1 total_sect_count += 1 SAT_sect_count_limit += 127 SAT = [self.SID_FREE_SECTOR]*128*SAT_sect_count sect = 0 while sect < book_sect_count - 1: self.book_stream_sect.append(sect) SAT[sect] = sect + 1 sect += 1 self.book_stream_sect.append(sect) SAT[sect] = self.SID_END_OF_CHAIN sect += 1 while sect < book_sect_count + MSAT_sect_count: self.MSAT_sect_2nd.append(sect) SAT[sect] = self.SID_USED_BY_MSAT sect += 1 while sect < book_sect_count + MSAT_sect_count + SAT_sect_count: self.SAT_sect.append(sect) SAT[sect] = self.SID_USED_BY_SAT sect += 1 while sect < book_sect_count + MSAT_sect_count + SAT_sect_count + dir_sect_count - 1: self.dir_stream_sect.append(sect) SAT[sect] = sect + 1 sect += 1 self.dir_stream_sect.append(sect) SAT[sect] = self.SID_END_OF_CHAIN sect += 1 self.packed_SAT = struct.pack('<%dl' % (SAT_sect_count*128), *SAT) MSAT_1st = [self.SID_FREE_SECTOR]*109 for i, SAT_sect_num in zip(range(0, 109), self.SAT_sect): MSAT_1st[i] = SAT_sect_num self.packed_MSAT_1st = struct.pack('<109l', *MSAT_1st) MSAT_2nd = [self.SID_FREE_SECTOR]*128*MSAT_sect_count if MSAT_sect_count > 0: MSAT_2nd[- 1] = self.SID_END_OF_CHAIN i = 109 msat_sect = 0 sid_num = 0 while i < SAT_sect_count: if (sid_num + 1) % 128 == 0: #print 'link: ', msat_sect += 1 if msat_sect < len(self.MSAT_sect_2nd): MSAT_2nd[sid_num] = self.MSAT_sect_2nd[msat_sect] else: #print 'sid: ', MSAT_2nd[sid_num] = self.SAT_sect[i] i += 1 #print sid_num, MSAT_2nd[sid_num] sid_num += 1 self.packed_MSAT_2nd = struct.pack('<%dl' % (MSAT_sect_count*128), *MSAT_2nd) #print vars() #print zip(range(0, sect), SAT) #print self.book_stream_sect #print self.MSAT_sect_2nd #print MSAT_2nd #print self.SAT_sect #print self.dir_stream_sect def __build_header(self): doc_magic = '\xD0\xCF\x11\xE0\xA1\xB1\x1A\xE1' file_uid = '\x00'*16 rev_num = '\x3E\x00' ver_num = '\x03\x00' byte_order = '\xFE\xFF' log_sect_size = struct.pack('