
Fetching the CASIA Offline and Online Handwritten Chinese Character Databases with Python (Part 2)

心之宙

In this article, I consolidate the feature data of the CASIA offline and online handwritten Chinese character databases into a single dictionary for convenient later use.

import os
import sys
import zipfile, rarfile
import struct, pickle
import pandas as pd
import numpy as np
import tables as tb
import time


def getZ(filename):
    name, end = os.path.splitext(filename)
    if end == '.rar':
        Z = rarfile.RarFile(filename)
    elif end == '.zip':
        Z = zipfile.ZipFile(filename)
    else:
        # avoid returning an undefined Z for other extensions
        raise ValueError(f'Unsupported archive type: {end}')
    return Z


class Bunch(dict):
    '''A dict whose keys are also accessible as attributes.'''

    def __init__(self, *args, **kwds):
        super().__init__(*args, **kwds)
        self.__dict__ = self

        
class MPF(Bunch):
    '''Parser for the header and records of a CASIA .mpf feature file.'''

    def __init__(self, fp, *args, **kwds):
        super().__init__(*args, **kwds)
        self.fp = fp
        # use explicit little-endian formats ('<I', '<h'); the bare 'l'
        # format is platform-dependent (8 bytes on 64-bit Linux)
        header_size = struct.unpack('<I', self.fp.read(4))[0]
        self.code_format = self.fp.read(8).decode('ascii').rstrip('\x00')
        # the fixed header fields take 62 bytes; the rest is a free-text note
        self.text = self.fp.read(header_size - 62).decode().rstrip('\x00')
        self.code_type = self.fp.read(20).decode('latin-1').rstrip('\x00')
        self.code_length = struct.unpack('<h', self.fp.read(2))[0]
        self.data_type = self.fp.read(20).decode('ascii').rstrip('\x00')
        self.nrows = struct.unpack('<I', self.fp.read(4))[0]
        self.ndims = struct.unpack('<I', self.fp.read(4))[0]

    def __iter__(self):
        # each record is a code_length-byte GB18030 label
        # followed by ndims feature bytes
        for _ in range(self.nrows):
            label = self.fp.read(self.code_length).decode('gb18030')
            data = np.frombuffer(self.fp.read(self.ndims), np.uint8)
            yield data, label

            
class Writer(Bunch):

    def __init__(self, mpf, *args, **kwds):
        '''
        self.feature is a NumPy structured array; each record holds a
        character label and that sample's uint8 feature vector.
        '''
        super().__init__(*args, **kwds)
        self.text = mpf.text
        t = np.dtype([('label', 'U2'), ('feature', np.uint8, mpf.ndims)])
        self.feature = np.array([(label, feature) for feature, label in iter(mpf)], dtype=t)

        
class Feature(Bunch):

    def __init__(self, root, set_name, *args, **kwds):
        super().__init__(*args, **kwds)
        filename, end = os.path.splitext(set_name)

        if 'HW' in filename and end == '.zip':
            if '_' not in filename:
                self.name = filename
                Z = getZ(f'{root}{set_name}')
                self._get_dataset(Z)
        else:
            # not one of the feature archives we need; skip it
            pass

    def _get_dataset(self, Z):
        for name in Z.namelist():
            if name.endswith('.mpf'):
                writer_ = f"writer{os.path.splitext(name)[0].split('/')[1]}"

                with Z.open(name) as fp:
                    mpf = MPF(fp)
                    wt = Writer(mpf)
                    self[writer_] = wt
                    

class XFeature(Bunch):

    def __init__(self, root, *args, **kwds):
        super().__init__(*args, **kwds)
        for filename in os.listdir(root):
            set_name, end = os.path.splitext(filename)
            if 'HW' in filename and end == '.zip':
                if '_' not in set_name:
                    setname = set_name.replace('.', '')
                    start = time.time()
                    self[setname] = Feature(root, filename)
                    print(f'{time.time() - start} s: dict {setname} created!')
                    
                    
def bunch2json(bunch, path):
    # note: despite the name, these helpers serialize with pickle, not JSON
    with open(path, 'wb') as fp:
        pickle.dump(bunch, fp)


def json2bunch(path):
    with open(path, 'rb') as fp:
        X = pickle.load(fp)
    return X
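
To make the parsing above concrete, here is a minimal sketch that opens a single .mpf file inside a feature archive and reads its first record; the archive name and member path are assumptions based on the naming scheme the classes expect (e.g. HWDB1.1tst.zip containing HWDB1.1tst/001.mpf):

import zipfile

with zipfile.ZipFile('E:/OCR/CASIA/HWDB1.1tst.zip') as Z:  # assumed archive
    with Z.open('HWDB1.1tst/001.mpf') as fp:               # assumed member
        mpf_one = MPF(fp)
        print(mpf_one.text, mpf_one.nrows, mpf_one.ndims)  # header fields
        feature, label = next(iter(mpf_one))               # first record
        print(label, feature.shape)                        # one character, (ndims,)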

Storing the feature data as structured arrays

%%time
root = 'E:/OCR/CASIA/'

mpf = XFeature(root)
17.253000497817993 s: dict HWDB10trn created!
4.3190014362335205 s: dict HWDB10tst created!
12.178295135498047 s: dict HWDB11trn created!
2.823002338409424 s: dict HWDB11tst created!
17.44099521636963 s: dict OLHWDB10trn created!
4.106986999511719 s: dict OLHWDB10tst created!
15.28876519203186 s: dict OLHWDB11trn created!
3.2720530033111572 s: dict OLHWDB11tst created!
Wall time: 1min 16s
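
Each Writer's feature is a structured array, so labels and feature vectors can be pulled out by field name. Here mpf is the XFeature dictionary just built, and writer001 is an assumed key following the writer<id> naming in Feature._get_dataset:

wt = mpf.HWDB10trn.writer001    # assumed writer key
print(wt.text)                  # description from the .mpf header
labels = wt.feature['label']    # character labels, shape (n,)
X = wt.feature['feature']       # uint8 features, shape (n, 512)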

Write the eight dictionaries above to local disk for later use.

%%time 
path = f'{root}mpf/feature.json'
bunch2json(mpf, path)
Wall time: 2min 9s
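
If the dump or load time matters, an optional variant (not used above) is to pass a higher pickle protocol, which usually yields a smaller file and faster round trips:

def bunch2json(bunch, path):
    with open(path, 'wb') as fp:
        # protocol 4+ also supports objects larger than 4 GB
        pickle.dump(bunch, fp, protocol=pickle.HIGHEST_PROTOCOL)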
%%time 
path = f'{root}mpf/feature.json'

xarray = json2bunch(path)
Wall time: 28.3 s

Loading the dictionary from disk takes only 28.3 seconds, which is quite fast, given its size:

size = os.path.getsize(f'{root}mpf/feature.json') / 1e9
print(f'The feature dictionary is {size} GB!')
The feature dictionary is 2.793863326 GB!
any(xarray) == any(mpf)  # a quick truthiness-only check on the keys
True
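
Note that any() only tests whether each dictionary has a truthy key, so this is a very loose check. A stricter sanity check (writer001 again being an assumed key) could compare the key sets and one writer's records directly:

assert sorted(xarray) == sorted(mpf)    # same dataset keys
a = xarray.HWDB10trn.writer001.feature  # assumed key
b = mpf.HWDB10trn.writer001.feature
assert (a == b).all()                   # identical records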

Structure of the feature dictionary

(figure: tree structure of the feature dictionary)

The entire feature dictionary is a tree structure!

Here, HWDB10trn and the like are the names of the datasets, each feature is the structured feature array of a writer..., and text is a short description of that feature.
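
For example, one writer's records can be viewed as a pandas DataFrame with the labels as the index (writer001 is an assumed key):

import pandas as pd

wt = xarray.HWDB10trn.writer001  # assumed writer key
df = pd.DataFrame(wt.feature['feature'], index=wt.feature['label'])
df.head()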


To reuse the packaged feature data without regenerating the dictionary, we need to import json2bunch, XFeature, Feature, Writer (I bundled them into an xhw.py script), since pickle can only restore the Bunch objects when their class definitions are importable. For example:

import sys

sys.path.append('E:/xlab')
from base.xhw import json2bunch, XFeature, Feature, Writer

root = 'E:/OCR/CASIA/' 
path = f'{root}mpf/feature.json' 

feature = json2bunch(path)   # this is the feature dictionary we need
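
From here, a whole dataset can be flattened into (X, y) arrays for training; this sketch assumes the HWDB11trn key and the writer-key naming built above:

import numpy as np

trn = feature.HWDB11trn  # assumed dataset key
writers = [v for k, v in trn.items() if k.startswith('writer')]
X = np.concatenate([w.feature['feature'] for w in writers])
y = np.concatenate([w.feature['label'] for w in writers])
print(X.shape, y.shape)  # (n_samples, 512), (n_samples,)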