kindle mobi词典格式分析及代码实现
解析代码语言c++
1.关于mobi正文分块
https://wiki.mobileread.com/wiki/PDB#Palm_Database_Format
mobi 正文分为若干块，各正文块（record）的信息位于文件开头偏移 78 字节处的记录表中。
std::vector<unsigned long> Offsets; 记录每个正文块的起始位置，通过 PdbStream::open() -> PdbHeader::read() 调用生成，代码如下:
/**
 * Reads the 78-byte PDB (Palm Database) header plus the record list.
 *
 * Layout parsed (offsets relative to startOffset):
 *   +0  .. +31  database name          -> DocName
 *   +32 .. +33  attribute flags        -> Flags
 *   +34 .. +59  version/dates/etc.     (skipped)
 *   +60 .. +67  type + creator id      -> Id
 *   +68 .. +75  uniqueID seed, nextRecordList (skipped)
 *   +76 .. +77  number of records      -> numRecords
 *   then numRecords 8-byte entries: 4-byte record offset (kept in Offsets)
 *   followed by 4 bytes of attributes/uniqueID (skipped).
 *
 * @param stream input stream positioned at the start of the PDB file
 * @return true iff the stream position after parsing matches the expected
 *         header size (78 + 8 * numRecords bytes consumed)
 */
bool PdbHeader::read(shared_ptr<ZLInputStream> stream) {
	const size_t startOffset = stream->offset();
	// cast: offset() is size_t, %d expects int
	LOGI("yun startOffset : %d", (int) stream->offset());
	DocName.erase();
	DocName.append(32, '\0');
	// &DocName[0] instead of (char*)DocName.data(): data() returns
	// const char* before C++17, writing through it is undefined behavior
	stream->read(&DocName[0], 32); // stream offset: +32
	Flags = PdbUtil::readUnsignedShort(*stream); // stream offset: +34
	stream->seek(26, false); // stream offset: +60
	Id.erase();
	Id.append(8, '\0');
	stream->read(&Id[0], 8); // stream offset: +68
	stream->seek(8, false); // stream offset: +76
	Offsets.clear();
	const unsigned short numRecords = PdbUtil::readUnsignedShort(*stream); // stream offset: +78
	Offsets.reserve(numRecords);
	LOGI("yun startOffset : %d numRecords : %d ", (int) stream->offset(), numRecords);
	for (int i = 0; i < numRecords; ++i) { // stream offset: +78 + 8 * records number
		const unsigned long recordOffset = PdbUtil::readUnsignedLongBE(*stream);
		Offsets.push_back(recordOffset);
		stream->seek(4, false); // skip per-record attributes + uniqueID
	}
	return stream->offset() == startOffset + 78 + 8 * numRecords;
}
2.关于mobi头文件
详情见:https://wiki.mobileread.com/wiki/MOBI#Format
字段位置都要加上 startOffset + 78 + 8 * numRecords偏移
我们现在只关注需要的信息：myCompressionVersion 为字段 0（长度 2 byte），表示词典压缩方式：1 不压缩，2 普通压缩（PalmDoc），17480 高压缩模式（HUFF/CDIC）。
判断 mobi 是否为一本词典：inputLanguage 标志为字段 96（长度 4 byte），outputLanguage 标志为字段 100（长度 4 byte）。
metaOrthIndex（字段 60，长度 4 byte）用于判断是否有索引；myMetainflindex（字段 64，长度 4 byte）表示词形变化索引的起点（如 find 的过去式 found）。
3.关于词典索引
词典有多个索引区，metaOrthIndex 为索引区的开始位置，每个区解压后最多为 65536 字节。索引区以 INDX 魔数标识该区是否为索引区。第一个索引区一般比较小，包含的信息有：索引块的总个数、索引可能用到的公共字符个数 count、公共字符块（ordt）。每个索引项占 8 个字节：前四个字节为索引内容所在的位置 startpos，后四个字节为该索引在正文中的位置 endpos。索引内容是连续且有序的，排序方式为：去掉标点符号和数字、大写转小写后，按得到的 UCS-2 编码顺序排列。
4.关于查词匹配索引
由于词典索引是有序的，查词采用二分查找效率最佳：先比较每个区的第一个索引确定目标所在的区，再在区内进行二分查找。以下为匹配算法:
/**
 * Looks up wordName in the dictionary's orthographic INDX index.
 *
 * Algorithm: a two-level binary search. The outer search (over records
 * myMetaOrthIndex+1 .. myMetaOrthIndex+1+count) compares against the FIRST
 * entry of each index record to find the record containing the word; the
 * inner search (the else-branch, entered once the outer search narrows to a
 * single record) binary-searches the entries inside that record. On an exact
 * match, neighboring entries before and after the hit are also fed to
 * getIndexTable (duplicates are possible); matching does not cross record
 * boundaries.
 *
 * @param wordName UTF-8 word to look up
 * @return isSuccess — NOTE(review): never set to true, so this always
 *         returns false; results appear to be accumulated into the
 *         indexTable member as a side effect. Verify what callers expect.
 */
bool PalmDocStream::loadIndexInfoForDict(std::string wordName) {
ZLUnicodeUtil::Ucs2String name;
ZLUnicodeUtil::utf8ToUcs2(name, wordName);// convert the query word to UCS-2
// if (name.size() > 4)
// LOGI("yun text3 : %ld %ld %ld %ld", name[0], name[1], name[2], name[3]);
//for (int i = 0; i < name.size(); i++)
//LOGI("yun text3 : %d ", name[i]);
bool isSuccess = false;
if (myMetaOrthIndex != 0xFFFFFFFF) {// 0xFFFFFFFF means the book has no dictionary index
int curOffset = myBase->offset();
//LOGI("yun text start2 %d",curOffset);
//LOGI("yun text myMetaOrthIndex %d",myMetaOrthIndex);
// first (small) INDX record: holds counts, ordt data and tag table
size_t hdrLen = header().Offsets[myMetaOrthIndex + 1] - header().Offsets[myMetaOrthIndex];
char data[hdrLen]; // NOTE(review): VLA — non-standard C++, gcc/clang extension
myBase->seek((int) header().Offsets[myMetaOrthIndex], true);
std::string string; // NOTE(review): unused local
myBase->read(data, hdrLen);
S_INDXHeader indxHeader;
bool isOk = parseINDXHeader(data, hdrLen, indxHeader);
if (isOk) {// false means the dictionary index is empty/invalid
S_INDXDict indxD;
/**
 * Extracts indxD.ordt2 (used to decode index entry text) and
 * indxD.otype (auxiliary decoding parameter).
 */
parseINDXHeaderForDictiony(data, indxD, 0xfdea == indxHeader.code);
std::vector<S_SectionTag> tagTable;
size_t controlByteCount = readTagSection(indxHeader.len, data, tagTable);
std::string lastText;
// outer search range: the index records following the header record
long start = myMetaOrthIndex + 1, end = myMetaOrthIndex + 1 + indxHeader.count, mid =
(start + end) / 2;
/**
 * Two-level binary search: first locate the index record, then binary
 * search within it. Because duplicates may exist, on a hit the scan
 * widens forward and backward (no cross-record matching yet).
 * NOTE(review): the outer search narrows with start=mid / end=mid (no
 * +/-1); it terminates only via the `mid > start` guard below — confirm
 * it cannot skip the boundary record.
 */
while (start <= end) {
//LOGI("yun text1 : %d %d", start, end);
if (mid > start) {
// load index record `mid` and compare against its first entry
size_t hdrLen = header().Offsets[mid + 1] - header().Offsets[mid];
char hdr[hdrLen];
myBase->seek((int) header().Offsets[mid], true);
myBase->read(hdr, hdrLen);
S_INDXHeader indxHeader2;
bool isOk = parseINDXHeader(hdr, hdrLen, indxHeader2);
if (isOk) {
// offset table starts at indxHeader2.start + 4; 2 bytes per entry
char pos_data0[2] = {hdr[indxHeader2.start + 4],
hdr[indxHeader2.start + 5]};
size_t startPos = PdbUtil::readUnsignedShort(pos_data0);
if (startPos > hdrLen)
break; // corrupt record: entry points outside the buffer
ZLUnicodeUtil::Ucs2String startString = getIndexString(startPos, indxD,
hdr);
long compare = ZLUnicodeUtil::compareTo(name, startString);
if (compare > 0) {
start = mid;
} else if (compare < 0) {
end = mid;
} else {
// exact match on the record's first entry: walk every entry
// in this record and collect the matches
int endPos = startPos;
int lastIndex = 0;
for (int i = 0; i < indxHeader2.count + 1; i++) {
startPos = endPos;
if (i == indxHeader2.count) {
endPos = indxHeader2.start; // last entry ends at the offset table
} else {
char pos_data2[2] = {hdr[indxHeader2.start + 4 + 2 * i],
hdr[indxHeader2.start + 5 + 2 * i]};
endPos = PdbUtil::readUnsignedShort(pos_data2);
}
lastIndex = getIndexTable(startPos, endPos, indxD, hdr,
controlByteCount, tagTable, name,
indexTable);
if (lastIndex == 0)
break; // entry no longer matches: stop widening
}
break;
}
mid = (start + end) / 2;
} else {
break;
}
} else {
// outer search has converged on one record: binary search its entries
size_t hdrLen = header().Offsets[mid + 1] - header().Offsets[mid];
char hdr[hdrLen];
myBase->seek((int) header().Offsets[mid], true);
myBase->read(hdr, hdrLen);
S_INDXHeader indxHeader2;
bool isOk = parseINDXHeader(hdr, hdrLen, indxHeader2);
if (isOk) {
long start = 0, end = indxHeader2.count, mid = (start + end) / 2;
while (start <= end) {
char pos_data0[2] = {hdr[indxHeader2.start + 4 + 2 * mid],
hdr[indxHeader2.start + 5 + 2 * mid]};
int startPos = PdbUtil::readUnsignedShort(pos_data0);
if (startPos > hdrLen)
break; // corrupt record
ZLUnicodeUtil::Ucs2String startString = getIndexString(startPos, indxD,
hdr);
//std::string text;
//ZLUnicodeUtil::ucs2ToUtf8(text, startString);
long compare = ZLUnicodeUtil::compareTo(name, startString);
if (compare > 0) {
start = mid + 1;
} else if (compare < 0) {
end = mid - 1;
} else {
// hit at entry `mid`: collect it and scan forward for duplicates
int endPos = startPos;
int lastIndex = 0;
for (int i = mid + 1; i < indxHeader2.count + 1; i++) {
startPos = endPos;
if (i == indxHeader2.count) {
endPos = indxHeader2.start;
} else {
char pos_data2[2] = {hdr[indxHeader2.start + 4 + 2 * i],
hdr[indxHeader2.start + 5 + 2 * i]};
endPos = PdbUtil::readUnsignedShort(pos_data2);
}
lastIndex = getIndexTable(startPos, endPos, indxD, hdr,
controlByteCount, tagTable, name,
indexTable);
if (lastIndex == 0)
break;
}
// then scan backward for duplicates preceding the hit
// NOTE(review): guard is `mid > 2` and the loop stops at i > 0,
// so entries 0..1 near the record start are never scanned — confirm intended
if (mid > 2) {
startPos = PdbUtil::readUnsignedShort(pos_data0);
for (int i = mid - 1; i > 0; i--) {
endPos = startPos;
char pos_data2[2] = {hdr[indxHeader2.start + 4 + 2 * i],
hdr[indxHeader2.start + 5 + 2 * i]};
startPos = PdbUtil::readUnsignedShort(pos_data2);
lastIndex = getIndexTable(startPos, endPos, indxD, hdr,
controlByteCount, tagTable, name,
indexTable);
if (lastIndex == 0)
break;
}
}
break;
}
mid = (start + end) / 2;
}
}
break;
}
}
}
// restore the stream position saved on entry
myBase->seek(curOffset, true);
}
return isSuccess;
}
5.关于查询正文
主要是正文跳转,以下为正文快速跳转代码:
/**
 * Fast random seek to `offset` in the decompressed text stream.
 *
 * For HUFF/CDIC books (myCompressionVersion == 17480) the decompressed size
 * of each record varies, so the per-record size table `datp` is walked to
 * find the record containing `offset`; for PalmDoc/uncompressed books each
 * record decompresses to myMaxRecordSize bytes, so the record index is a
 * simple division.
 *
 * @param offset absolute offset into the decompressed text
 * @return always true
 */
bool PalmDocStream::fastSeek(size_t offset) {
	int maxRecordSize = myMaxRecordSize;
	size_t offsetNum = offset / maxRecordSize;
	if (myCompressionVersion == 17480) {
		if (myOffset != 0) {
			// invalidate the decompression buffer before repositioning
			myBufferOffset = 0;
			myBufferLength = 0;
		}
		// NOTE(review): myOffset is not reset to 0 before this accumulation,
		// so a second call continues from the previous position — confirm
		// this is intended and not a latent bug.
		for (size_t i = 0; i < datp.size(); ++i) {
			myOffset += datp[i];
			if (myOffset > offset) {
				myOffset -= datp[i]; // step back to the record's start offset
				myRecordIndex = i;
				break;
			}
		}
		if (myOffset != 0 && myRecordIndex == 0) {
			// target lies beyond the last known record boundary
			myMaxRecordIndex = datp.size() - 1;
		}
		// FIX: removed stray token `myCompressionVersion);` (leftover from a
		// deleted log statement) that made this function fail to compile.
		seek(offset - myOffset, false);
	} else if (offsetNum + 1 != myRecordIndex) {
		// FIX: the original condition tested `offsetNum + 1 != myRecordIndex`
		// twice; the duplicate was collapsed (behavior unchanged).
		if (myOffset != 0 && myOffset % maxRecordSize != 0) {
			myBufferOffset = 0;
			myBufferLength = 0;
		}
		myOffset = offsetNum * maxRecordSize;
		myRecordIndex = offsetNum;
		seek(offset - myOffset, false);
	} else {
		seek(offset, true);
	}
	return true;
}
当高压缩模式还需要解析出每个块解压后的大小:
// HUFF/CDIC high-compression setup: read the HUFF table location from the
// MOBI header, build the Huffman decompressor, then parse the DATP records
// to learn each text record's decompressed size (filled into `datp`).
if (myCompressionVersion == 17480) {
unsigned long mobiHeaderLength;
unsigned long huffSectionIndex;
unsigned long huffSectionNumber;
unsigned long huffTblOffset;
unsigned long huffTblNumber;
unsigned long extraFlags = 0;
unsigned long initialOffset = header().Offsets[header().Start + 0];
myBase->seek(initialOffset + 20,
true); // myBase offset: ^ + 20
mobiHeaderLength = PdbUtil::readUnsignedLongBE(*myBase); // myBase offset: ^ + 24
myBase->seek(initialOffset + 112,
true); // myBase offset: ^ + 112
// MOBI header fields 112..128: first HUFF record index, HUFF record count,
// HUFF table offset and table count
huffSectionIndex = PdbUtil::readUnsignedLongBE(*myBase); // myBase offset: ^ + 116
huffSectionNumber = PdbUtil::readUnsignedLongBE(*myBase); // myBase offset: ^ + 120
huffTblOffset = PdbUtil::readUnsignedLongBE(*myBase); // myBase offset: ^ + 124
huffTblNumber = PdbUtil::readUnsignedLongBE(*myBase); // myBase offset: ^ + 128
if (16 + mobiHeaderLength >= 244) {
// header is long enough to contain the extra-data flags at +240
myBase->seek(initialOffset + 240,
true); // myBase offset: ^ + 240
extraFlags = PdbUtil::readUnsignedLongBE(*myBase); // myBase offset: ^ + 244
}
const unsigned long endHuffSectionIndex = huffSectionIndex + huffSectionNumber;
if (endHuffSectionIndex > endSectionIndex || huffSectionNumber <= 1) {
// need at least one HUFF record plus one CDIC record
myErrorCode = ERROR_COMPRESSION;
return false;
}
const unsigned long endHuffDataOffset = recordOffset(endHuffSectionIndex);
std::vector<unsigned long>::const_iterator beginHuffSectionOffsetIt =
header().Offsets.begin() + huffSectionIndex;
// point to first Huff section
std::vector<unsigned long>::const_iterator endHuffSectionOffsetIt =
header().Offsets.begin() + endHuffSectionIndex;
// point behind last Huff section
// LOGI("mobi******attachStorage beginHuffSectionOffsetIt : %d endHuffSectionOffsetIt : %d ,endHuffDataOffset : %d ,myBase->sizeOfOpened() : %d",
// beginHuffSectionOffsetIt,endHuffSectionOffsetIt,endHuffDataOffset,myBase->sizeOfOpened());
myHuffDecompressorPtr = new HuffDecompressor(*myBase, beginHuffSectionOffsetIt,
endHuffSectionOffsetIt, endHuffDataOffset,
extraFlags);
datp.clear();
//LOGI("yun %d huffTblNumber : %d",huffTblOffset,huffTblNumber);
// NOTE(review): int loop index compared against unsigned long bounds —
// signed/unsigned mismatch, overflows for very large tables
for (int i = huffTblOffset; i < huffTblOffset + huffTblNumber; ++i) {
size_t hdrLen = header().Offsets[i + 1] - header().Offsets[i];
char hdr[hdrLen]; // NOTE(review): VLA — non-standard C++ extension
myBase->seek((int) header().Offsets[i], true);
myBase->read(hdr, hdrLen);
loadDatpForDict(hdr, hdrLen);
//LOGI("yun %d ",datp.size());
}
// reposition to the record-count field for the caller
myBase->seek(initialOffset + 14,
true); // myBase offset: ^ + 14
}
/**
 * Parses one DATP record and appends its 16-bit decompressed-size entries
 * to the `datp` member.
 *
 * Record layout (as read here): bytes 0-3 "DATP" magic, byte 8 a record
 * count, bytes 10-11 the number of 16-bit entries; the entries themselves
 * sit at the end of the record, possibly followed by zero padding.
 *
 * @param data   raw DATP record bytes
 * @param length record length in bytes
 * @return the count byte at offset 8, or 0 if the magic does not match
 *         or the record is malformed
 */
int PalmDocStream::loadDatpForDict(const char *data, long length) {
	if (data[0] == 'D' && data[1] == 'A' && data[2] == 'T' && data[3] == 'P') {
		// FIX: read the count byte as unsigned — plain char may be signed and
		// would sign-extend for values > 127
		int num = (unsigned char) data[8];
		char countBytes[2] = {data[10], data[11]};
		size_t count = PdbUtil::readUnsignedShort(countBytes);
		// trim trailing zero padding: shrink `length` to just past the last
		// non-zero 16-bit word (FIX: loop index widened int -> long to match)
		for (long i = length - 2; i >= 0; i = i - 2) {
			char wordBytes[2] = {data[i + 0], data[i + 1]};
			length = i;
			if (PdbUtil::readUnsignedShort(wordBytes) != 0) {
				length += 2;
				break;
			}
		}
		// FIX: guard against size_t underflow on malformed records where the
		// declared entry count exceeds the remaining record length
		if (count * 2 > (size_t) length) {
			return 0;
		}
		size_t start = length - count * 2;
		size_t len = count * 2 + start; // == (size_t) length
		for (size_t i = start; i < len; i = i + 2) {
			char entryBytes[2] = {data[i + 0], data[i + 1]};
			datp.push_back(PdbUtil::readUnsignedShort(entryBytes));
		}
		return num;
	}
	return 0;
}
- Java 时间类-Calendar、Date、LocalDate/LocalTime
- Java消息队列--JMS概述
- Java FtpClient 实现文件上传服务
- Java消息队列--ActiveMq 实战
- Java消息队列-Spring整合ActiveMq
- 【知识】SAS数据分析完整笔记(3)
- 深入浅出Redis-Spring整合Redis
- Stream-快速入门Stream编程
- MySQL Regular Expression
- Jenkin-持续集成
- 4.3.4.7 Pattern Matching
- mysql left join、right join、inner join用法分析
- _CrtSetDbgFlag
- UNPv13:#第3章#套接字编程简介
- JavaScript 教程
- JavaScript 编辑工具
- JavaScript 与HTML
- JavaScript 与Java
- JavaScript 数据结构
- JavaScript 基本数据类型
- JavaScript 特殊数据类型
- JavaScript 运算符
- JavaScript typeof 运算符
- JavaScript 表达式
- JavaScript 类型转换
- JavaScript 基本语法
- JavaScript 注释
- Javascript 基本处理流程
- Javascript 选择结构
- Javascript if 语句
- Javascript if 语句的嵌套
- Javascript switch 语句
- Javascript 循环结构
- Javascript 循环结构实例
- Javascript 跳转语句
- Javascript 控制语句总结
- Javascript 函数介绍
- Javascript 函数的定义
- Javascript 函数调用
- Javascript 几种特殊的函数
- JavaScript 内置函数简介
- Javascript eval() 函数
- Javascript isFinite() 函数
- Javascript isNaN() 函数
- parseInt() 与 parseFloat()
- escape() 与 unescape()
- Javascript 字符串介绍
- Javascript length属性
- javascript 字符串函数
- Javascript 日期对象简介
- Javascript 日期对象用途
- Date 对象属性和方法
- Javascript 数组是什么
- Javascript 创建数组
- Javascript 数组赋值与取值
- Javascript 数组属性和方法
- 【第19期】HTTP请求头referer
- Sentinel流控日志与索引
- Next.js + TypeScript 搭建一个简易的博客系统
- 【redis】02-redis持久化存储以及对象存储
- Kubernetes 集群可视化监控之 Weave Scope 入门
- h5 与原生 app 交互的原理
- 怎么在Openresty中REST?
- 【redis】04-redis 根据监听key的失效事件实现订单超时关闭
- 搭建分布式任务调度平台
- 微信小程序根据线上版本 Source Map 文件定位错误代码
- 全解系列:内存泄漏定位工具LeakCanary!
- 【Java反射】触手可及
- 【Flutter 专题】100 何为 Flutter Widgets ?
- Python爬虫 爬取豆瓣电影Top250信息
- Python编程 基础练习(一)