慕后森
# Assume the character set is purely alphanumeric, i.e. a-z A-Z 0-9, which
# requires 6 bits per character. Using an 8-bit byte encoding is therefore,
# in theory, an inefficient use of memory. This answer converts the input
# bytes into a sequence of 6-bit integers and uses bitwise operations to pack
# those small integers into one large integer. Whether this actually
# translates into real-world storage efficiency is measured by
# sys.getsizeof, and it is more likely to pay off for larger strings.
#
# The implementation customizes the encoding to the chosen character set.
# For example, if you only use string.ascii_lowercase (5 bits) rather than
# string.ascii_uppercase + string.digits (6 bits), the encoding becomes
# correspondingly more efficient. Unit tests are included as well.

import itertools
import random
import string
import unittest

__all__ = ["BytesIntEncoder"]


class BytesIntEncoder:
    """Pack bytes drawn from a fixed alphabet into a single integer.

    Every byte of the alphabet is assigned a code in ``1..len(chars)``
    (code 0 is reserved so that the end of the data can be recognised when
    decoding).  Codes are stored little-endian: the first input byte occupies
    the lowest ``_num_bits_per_char`` bits of the resulting integer.
    """

    def __init__(
        self,
        chars: bytes = (string.ascii_letters + string.digits).encode(),
    ) -> None:
        """Build the translation tables for the alphabet *chars*.

        Args:
            chars: The bytes that may appear in encoded input.  At most 255
                bytes are supported because each one is mapped to a distinct
                non-zero single-byte code.

        Raises:
            ValueError: If *chars* holds more than 255 bytes.
        """
        num_chars = len(chars)
        if num_chars > 255:
            raise ValueError(
                f"alphabet has {num_chars} bytes; at most 255 are supported"
            )
        # Build the codes directly as bytes.  Going through
        # ''.join(chr(i) ...).encode() yields multi-byte UTF-8 sequences for
        # codes >= 128, which makes maketrans() fail on larger alphabets.
        codes = bytes(range(1, num_chars + 1))
        self._alphabet = bytes(chars)
        self._translation_table = bytes.maketrans(chars, codes)
        self._reverse_translation_table = bytes.maketrans(codes, chars)
        # NOTE: num_chars.bit_length() would already be enough; the "+ 1" is
        # kept so that integers produced by earlier versions still decode.
        self._num_bits_per_char = (num_chars + 1).bit_length()

    def encode(self, chars: bytes) -> int:
        """Return the integer that encodes *chars*.

        Args:
            chars: Bytes consisting only of bytes from the alphabet.

        Returns:
            A non-negative integer; ``0`` for empty input.

        Raises:
            ValueError: If *chars* contains a byte outside the alphabet.
                Such bytes would otherwise pass through the translation
                untouched and silently corrupt the result.
        """
        # Deleting every alphabet byte leaves exactly the offending bytes.
        unknown = chars.translate(None, self._alphabet)
        if unknown:
            raise ValueError(
                f"bytes outside the configured alphabet: {unknown!r}"
            )
        num_bits_per_char = self._num_bits_per_char
        output, bit_idx = 0, 0
        for code in chars.translate(self._translation_table):
            output |= code << bit_idx
            bit_idx += num_bits_per_char
        return output

    def decode(self, i: int) -> bytes:
        """Return the bytes that were encoded as the integer *i*.

        Args:
            i: A value previously returned by :meth:`encode`.

        Returns:
            The original bytes; ``b''`` for ``0``.

        Raises:
            ValueError: If *i* is negative or contains a code that does not
                correspond to any byte of the alphabet.
        """
        if i < 0:
            raise ValueError("cannot decode a negative integer")
        num_bits_per_char = self._num_bits_per_char
        mask = (1 << num_bits_per_char) - 1
        max_code = len(self._alphabet)
        # Every valid code is non-zero, so the highest set bit of ``i`` always
        # belongs to the last character and bit_length() bounds the scan.
        codes = [
            (i >> offset) & mask
            for offset in range(0, i.bit_length(), num_bits_per_char)
        ]
        if not all(1 <= code <= max_code for code in codes):
            raise ValueError(
                f"{i} is not a valid encoding for the configured alphabet"
            )
        return bytes(codes).translate(self._reverse_translation_table)


# Test


class TestBytesIntEncoder(unittest.TestCase):
    """Round-trip tests for :class:`BytesIntEncoder`."""

    chars = string.ascii_letters + string.digits
    encoder = BytesIntEncoder(chars.encode())

    def _test_encoding(self, b_in: bytes) -> None:
        """Assert that *b_in* survives an encode/decode round trip."""
        i = self.encoder.encode(b_in)
        self.assertIsInstance(i, int)
        b_out = self.encoder.decode(i)
        self.assertIsInstance(b_out, bytes)
        self.assertEqual(b_in, b_out)

    @staticmethod
    def _num_samples(s_len: int) -> int:
        """Return how many random strings of length *s_len* to try."""
        if s_len <= 16:
            return 2 ** s_len
        if s_len <= 32:
            return s_len ** 2
        if s_len <= 64:
            return s_len * 2
        if s_len <= 128:
            return s_len
        return 2

    def test_thoroughly_with_small_str(self):
        for s_len in range(4):
            for s in itertools.combinations_with_replacement(self.chars, s_len):
                self._test_encoding(''.join(s).encode())

    def test_randomly_with_large_str(self):
        for s_len in range(256):
            for _ in range(self._num_samples(s_len)):
                b_in = ''.join(random.choices(self.chars, k=s_len)).encode()
                self._test_encoding(b_in)

    def test_large_alphabet(self):
        encoder = BytesIntEncoder(bytes(range(1, 201)))
        b_in = bytes([1, 127, 128, 200])
        self.assertEqual(encoder.decode(encoder.encode(b_in)), b_in)

    def test_rejects_bytes_outside_alphabet(self):
        with self.assertRaises(ValueError):
            self.encoder.encode(b'not valid!')

    def test_rejects_invalid_integers(self):
        with self.assertRaises(ValueError):
            self.encoder.decode(-1)
        with self.assertRaises(ValueError):
            self.encoder.decode(63)  # code 63 is unused with 62 characters


if __name__ == '__main__':
    unittest.main()

# Usage example:
#
# >>> encoder = BytesIntEncoder()
# >>> s = 'Test123'
# >>> b = s.encode()
# >>> b
# b'Test123'
# >>> encoder.encode(b)
# 3908257788270
# >>> encoder.decode(_)
# b'Test123'