In this post I consolidate the feature data of the CASIA offline and online handwritten Chinese character databases (HWDB/OLHWDB) into a single dictionary for convenient later use.
import os
import sys
import zipfile, rarfile
import struct, pickle
import pandas as pd
import numpy as np
import tables as tb
import time
def getZ(filename):
    '''Open a .rar or .zip archive and return the archive object.'''
    name, end = os.path.splitext(filename)
    if end == '.rar':
        Z = rarfile.RarFile(filename)
    elif end == '.zip':
        Z = zipfile.ZipFile(filename)
    else:
        # The original fell through silently here and would raise
        # UnboundLocalError on `return Z`; fail with a clear message instead.
        raise ValueError(f'Unsupported archive type: {end}')
    return Z
class Bunch(dict):
    '''A dict whose items are also accessible as attributes.'''
    def __init__(self, *args, **kwds):
        super().__init__(*args, **kwds)
        self.__dict__ = self
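A Bunch behaves like a dict whose keys double as attributes, which is what makes the chained access such as mpf.HWDB10trn possible later. A minimal, illustrative sketch of its semantics:

b = Bunch(name='HWDB1.0trn')
assert b.name == b['name']   # attribute access and item access hit the same storage
b.nrows = 100                # setting an attribute also creates the dict key
assert b['nrows'] == 100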
class MPF(Bunch):
    '''Parser for a CASIA .mpf feature file.

    Header layout: size (4 bytes) + format code (8) + illustration text
    (size - 62) + code type (20) + code length (2) + data type (20) +
    sample number (4) + dimensionality (4); 4+8+20+2+20+4+4 = 62.
    '''
    def __init__(self, fp, *args, **kwds):
        super().__init__(*args, **kwds)
        self.fp = fp
        # '<l'/'<h' pin 4-/2-byte little-endian integers; native 'l' is
        # 8 bytes on 64-bit Linux and would misread the header there.
        header_size = struct.unpack('<l', self.fp.read(4))[0]
        self.code_format = self.fp.read(8).decode('ascii').rstrip('\x00')
        self.text = self.fp.read(header_size - 62).decode().rstrip('\x00')
        self.code_type = self.fp.read(20).decode('latin-1').rstrip('\x00')
        self.code_length = struct.unpack('<h', self.fp.read(2))[0]
        self.data_type = self.fp.read(20).decode('ascii').rstrip('\x00')
        self.nrows = struct.unpack('<l', self.fp.read(4))[0]
        self.ndims = struct.unpack('<l', self.fp.read(4))[0]

    def __iter__(self):
        # Each record is a GB-coded label (code_length bytes) followed by
        # an ndims-byte uint8 feature vector.
        for _ in range(self.nrows):
            label = self.fp.read(self.code_length).decode('gb18030')
            data = np.frombuffer(self.fp.read(self.ndims), np.uint8)
            yield data, label
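To sanity-check the parser on a single file, an extracted .mpf can be opened directly; a sketch, with a hypothetical local path:

# Hypothetical path to one extracted file; any HWDB1.x .mpf works the same way.
with open('E:/OCR/CASIA/HWDB1.0trn/001.mpf', 'rb') as fp:
    mpf_file = MPF(fp)
    print(mpf_file.nrows, mpf_file.ndims)   # sample count and feature dimensionality
    feature, label = next(iter(mpf_file))   # first record
    print(label, feature.shape)             # one character label, (ndims,) uint8 vector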
class Writer(Bunch):
    def __init__(self, mpf, *args, **kwds):
        '''Pack one writer's samples into a NumPy structured array.

        `self.feature` can be turned into a pandas DataFrame via
        pd.DataFrame.from_dict(dict(self.feature)).
        '''
        super().__init__(*args, **kwds)
        self.text = mpf.text
        # 'U2': unicode label of up to 2 characters; each feature vector is 512 uint8 values.
        t = np.dtype([('label', 'U2'), ('feature', np.uint8, 512)])
        self.feature = np.array([(label, feature) for feature, label in mpf], dtype=t)
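Field access on the structured array is by name, so selecting all samples of one character is a one-liner; an illustrative helper (samples_of is not part of the original code):

def samples_of(wt, char):
    '''Return all of a Writer's feature vectors whose label equals `char`.'''
    mask = wt.feature['label'] == char   # boolean mask over the 'label' field
    return wt.feature['feature'][mask]   # corresponding rows of the 'feature' field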
class Feature(Bunch):
    def __init__(self, root, set_name, *args, **kwds):
        super().__init__(*args, **kwds)
        filename, end = os.path.splitext(set_name)
        # Only the plain HWDB/OLHWDB feature zips are wanted; names containing
        # '_' belong to other archives and are skipped.
        if 'HW' in filename and end == '.zip' and '_' not in filename:
            self.name = filename
            Z = getZ(f'{root}{set_name}')
            self._get_dataset(Z)

    def _get_dataset(self, Z):
        for name in Z.namelist():
            if name.endswith('.mpf'):
                # e.g. 'HWDB1.0trn/001.mpf' -> key 'writer001'
                writer_ = f"writer{os.path.splitext(name)[0].split('/')[1]}"
                with Z.open(name) as fp:
                    self[writer_] = Writer(MPF(fp))
class XFeature(Bunch):
    def __init__(self, root, *args, **kwds):
        super().__init__(*args, **kwds)
        for filename in os.listdir(root):
            set_name, end = os.path.splitext(filename)
            if 'HW' in filename and end == '.zip' and '_' not in set_name:
                # 'HWDB1.0trn' -> key 'HWDB10trn' (dots are not attribute-friendly)
                setname = set_name.replace('.', '')
                start = time.time()
                self[setname] = Feature(root, filename)
                print(f'{time.time() - start} seconds: dictionary {setname} built!')
def bunch2json(bunch, path):
    '''Serialize the Bunch to disk (pickle, despite the .json file name).'''
    with open(path, 'wb') as fp:
        pickle.dump(bunch, fp)

def json2bunch(path):
    '''Load a Bunch previously saved with bunch2json.'''
    with open(path, 'rb') as fp:
        X = pickle.load(fp)
    return X
Store the feature data as structured arrays
%%time
root = 'E:/OCR/CASIA/'
mpf = XFeature(root)
17.253000497817993 seconds: dictionary HWDB10trn built!
4.3190014362335205 seconds: dictionary HWDB10tst built!
12.178295135498047 seconds: dictionary HWDB11trn built!
2.823002338409424 seconds: dictionary HWDB11tst built!
17.44099521636963 seconds: dictionary OLHWDB10trn built!
4.106986999511719 seconds: dictionary OLHWDB10tst built!
15.28876519203186 seconds: dictionary OLHWDB11trn built!
3.2720530033111572 seconds: dictionary OLHWDB11tst built!
Wall time: 1min 16s
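With the build done, the nested Bunch can be navigated by attribute; a sketch (actual writer keys depend on the archive contents, so they are listed rather than hard-coded):

trn = mpf.HWDB10trn                                   # one dataset-level Feature dict
writers = [k for k in trn if k.startswith('writer')]  # skip the 'name' entry
wt = trn[writers[0]]
print(wt.text)                                        # description text from the MPF header
print(wt.feature.shape, wt.feature.dtype)             # (nrows,) structured records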
Write the above 8 dictionaries to local disk for later use.
%%time
path = f'{root}mpf/feature.json'
bunch2json(mpf, path)
Wall time: 2min 9s
%%time
path = f'{root}mpf/feature.json'
xarray = json2bunch(path)
Wall time: 28.3 s
Loading the dictionary from disk takes only 28.3 seconds, which is quite fast considering its size:
size = os.path.getsize(f'{root}mpf/feature.json') / 1e9
print(f'The feature dictionary is {size} GB!')
The feature dictionary is 2.793863326 GB!
A quick (and admittedly weak) sanity check that the reloaded dictionary looks like the original:
any(xarray) == any(mpf)
True
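any() only tests that each dict contains a truthy key, so the check above is weak; a slightly stricter spot check, still far from a full comparison:

assert xarray.keys() == mpf.keys()                  # same dataset names
for name in xarray:
    assert xarray[name].keys() == mpf[name].keys()  # same writers per dataset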
The structure of the feature dictionary

The whole feature dictionary is a tree. Keys such as HWDB10trn name the datasets; inside each dataset, the writer... entries hold each writer's feature array (feature), and text is a short description of that feature data.

To reuse the packaged feature data without regenerating the dictionary, import json2bunch, XFeature, Feature, Writer (I packaged them into an xhw.py script), for example:
import sys
sys.path.append('E:/xlab')
from base.xhw import json2bunch, XFeature, Feature, Writer
root = 'E:/OCR/CASIA/'
path = f'{root}mpf/feature.json'
feature = json2bunch(path)  # the feature dictionary we need
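For analysis it is often convenient to flatten one writer into a pandas DataFrame; a sketch of one way to do it (writer_to_frame and the writer001 key are illustrative, not part of xhw.py):

import pandas as pd

def writer_to_frame(wt):
    '''One row per sample: a 'label' column followed by 512 uint8 feature columns.'''
    df = pd.DataFrame(wt.feature['feature'])   # shape (nrows, 512)
    df.insert(0, 'label', wt.feature['label'])
    return df

# e.g. df = writer_to_frame(feature.HWDB10trn.writer001)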