The log file contents are a sequence of 32KB blocks. The only exception is that the tail of the file may contain a partial block.
Each block consists of a sequence of records:
block := record* trailer?
record :=
  checksum: uint32     // crc32c of type and data[] ; little-endian
  length: uint16       // little-endian
  type: uint8          // One of FULL, FIRST, MIDDLE, LAST
  data: uint8[length]
总结一下:
日志文件由若干个 32KB 的 block 组成 (只有文件末尾可能是一个不完整的 block)
每个 block 的结构写法有点像正则表达式: 包含零个或多个 record, 以及可能存在的 trailer
每个 record 包含 crc32, little-endian 的 length, 一个类型和具体的 byte 数据
A record never starts within the last six bytes of a block (since it won’t fit). Any leftover bytes here form the trailer, which must consist entirely of zero bytes and must be skipped by readers.
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
//
// Log format information shared by reader and writer.
// See ../doc/log_format.md for more detail.
// Header is checksum (4 bytes), length (2 bytes), type (1 byte).
static const int kHeaderSize = 4 + 2 + 1;
} // namespace log } // namespace leveldb
#endif// STORAGE_LEVELDB_DB_LOG_FORMAT_H_
kBlockSize 是 32k, 即32 * 1024 = 32768
RecordType 是个 enum, 有下列几种类型。C++ 中没有显式指定底层类型的 enum, 其底层类型由实现决定 (通常是 int), 不过写入日志时 type 字段只占一个字节 (uint8)
kFullType 全记录
kFirstType、kMiddleType、kLastType: 被切分的记录的第一个 (first)、中间的 (middle)、最后一个 (last) 片段
kZeroType: 值为 0 的保留类型。LevelDB 源码 (log_format.h) 中的注释说明 0 是为预分配的文件保留的 (Zero is reserved for preallocated files)
kHeaderSize 包含 checksum, length, type。长度如前文所述。
关于 first middle last 还可以参考文档:
Example: consider a sequence of user records:
A: length 1000
B: length 97270
C: length 8000
A will be stored as a FULL record in the first block.
B will be split into three fragments: first fragment occupies the rest of the first block, second fragment occupies the entirety of the second block, and the third fragment occupies a prefix of the third block. This will leave six bytes free in the third block, which will be left empty as the trailer.
C will be stored as a FULL record in the fourth block.
classWriter { public: // Create a writer that will append data to "*dest". // "*dest" must be initially empty. // "*dest" must remain live while this Writer is in use. explicitWriter(WritableFile* dest);
// Create a writer that will append data to "*dest". // "*dest" must have initial length "dest_length". // "*dest" must remain live while this Writer is in use. Writer(WritableFile* dest, uint64_t dest_length);
private: Status EmitPhysicalRecord(RecordType type, constchar* ptr, size_t length);
WritableFile* dest_; int block_offset_; // Current offset in block
// crc32c values for all supported record types. These are // pre-computed to reduce the overhead of computing the crc of the // record type stored in the header. uint32_t type_crc_[kMaxRecordType + 1]; };
// Appends "slice" to the log as one logical record.
//
// The payload is cut into fragments so that every physical record fits in
// the space left in the current block. Each fragment is emitted with type
// kFullType, kFirstType, kMiddleType or kLastType according to its position
// in the record. Emission stops at the first physical write that fails, and
// the Status of the last EmitPhysicalRecord call is returned.
Status Writer::AddRecord(const Slice& slice) {
  // "ptr" and "left" track the part of slice that is still unwritten.
  const char* ptr = slice.data();
  size_t left = slice.size();

  // Fragment the record if necessary and emit it.  Note that if slice
  // is empty, we still want to iterate once to emit a single
  // zero-length record
  Status s;
  bool begin = true;
  do {
    const int leftover = kBlockSize - block_offset_;
    assert(leftover >= 0);
    // Fewer than kHeaderSize bytes remain, so not even a header fits:
    // pad the rest of the block and continue in a new one.
    if (leftover < kHeaderSize) {
      // Switch to a new block
      if (leftover > 0) {
        // Fill the trailer (literal below relies on kHeaderSize being 7)
        static_assert(kHeaderSize == 7, "");
        // NOTE(review): the Status returned by this Append is ignored.
        dest_->Append(Slice("\x00\x00\x00\x00\x00\x00", leftover));
      }
      // Start writing at the beginning of a new block.
      block_offset_ = 0;
    }

    // Invariant: we never leave < kHeaderSize bytes in a block.
    assert(kBlockSize - block_offset_ - kHeaderSize >= 0);

    // Payload bytes that fit in the current block once the header is
    // accounted for; the fragment takes as much of the record as fits.
    const size_t avail = kBlockSize - block_offset_ - kHeaderSize;
    const size_t fragment_length = (left < avail) ? left : avail;

    RecordType type;
    // "end" is true when this fragment carries the rest of the record,
    // i.e. the record can be finished inside the current block.
    const bool end = (left == fragment_length);
    if (begin && end) {
      type = kFullType;
    } else if (begin) {
      type = kFirstType;
    } else if (end) {
      type = kLastType;
    } else {
      type = kMiddleType;
    }

    s = EmitPhysicalRecord(type, ptr, fragment_length);
    ptr += fragment_length;
    left -= fragment_length;
    begin = false;
  } while (s.ok() && left > 0);
  return s;
}
// Writes one physical record to dest_: a kHeaderSize-byte header followed
// by "length" payload bytes starting at "ptr".
//
// "length" must fit in two bytes and the whole record must fit in the
// current block (both are asserted). Returns the Status of the first
// failing Append/Flush, or the Status of the Flush when every step succeeds.
Status Writer::EmitPhysicalRecord(RecordType t, const char* ptr,
                                  size_t length) {
  assert(length <= 0xffff);  // Must fit in two bytes
  assert(block_offset_ + kHeaderSize + length <= kBlockSize);

  // Format the header: buf[4] and buf[5] hold the low and high byte of the
  // length, buf[6] holds the record type. The checksum is written at the
  // start of buf further down.
  char buf[kHeaderSize];
  buf[4] = static_cast<char>(length & 0xff);
  buf[5] = static_cast<char>(length >> 8);
  buf[6] = static_cast<char>(t);

  // Compute the crc of the record type and the payload.
  uint32_t crc = crc32c::Extend(type_crc_[t], ptr, length);
  crc = crc32c::Mask(crc);  // Adjust for storage
  EncodeFixed32(buf, crc);

  // Write the header and the payload
  Status s = dest_->Append(Slice(buf, kHeaderSize));
  if (s.ok()) {
    s = dest_->Append(Slice(ptr, length));
    if (s.ok()) {
      s = dest_->Flush();
    }
  }
  // The offset is advanced whether or not the writes succeeded.
  block_offset_ += kHeaderSize + length;
  return s;
}
// Some corruption was detected. "size" is the approximate number // of bytes dropped due to the corruption. virtualvoidCorruption(size_t bytes, const Status& status)= 0; };
// Create a reader that will return log records from "*file". // "*file" must remain live while this Reader is in use. // // If "reporter" is non-null, it is notified whenever some data is // dropped due to a detected corruption. "*reporter" must remain // live while this Reader is in use. // // If "checksum" is true, verify checksums if available. // // The Reader will start reading at the first record located at physical // position >= initial_offset within the file. Reader(SequentialFile* file, Reporter* reporter, bool checksum, uint64_t initial_offset);
// Read the next record into *record. Returns true if read // successfully, false if we hit end of the input. May use // "*scratch" as temporary storage. The contents filled in *record // will only be valid until the next mutating operation on this // reader or the next mutation to *scratch. boolReadRecord(Slice* record, std::string* scratch);
// Returns the physical offset of the last record returned by ReadRecord.
//
// Undefined before the first call to ReadRecord.
uint64_t LastRecordOffset();
private:
// Extend record types with the following special values
enum {
  kEof = kMaxRecordType + 1,
  // Returned whenever we find an invalid physical record.
  // Currently there are three situations in which this happens:
  // * The record has an invalid CRC (ReadPhysicalRecord reports a drop)
  // * The record is a 0-length record (No drop is reported)
  // * The record is below constructor's initial_offset (No drop is reported)
  kBadRecord = kMaxRecordType + 2
};
// Skips all blocks that are completely before "initial_offset_".
//
// Returns true on success. Handles reporting.
bool SkipToInitialBlock();
// 返回 kMaxRecordType 定义的 type 或者 eof // Return type, or one of the preceding special values unsignedintReadPhysicalRecord(Slice* result);
// Reports dropped bytes to the reporter. // buffer_ must be updated to remove the dropped bytes prior to invocation. voidReportCorruption(uint64_t bytes, constchar* reason); voidReportDrop(uint64_t bytes, const Status& reason);
// 文件中 last_record 读的 offset // Offset of the last record returned by ReadRecord. uint64_t last_record_offset_;
// 维护的 buffer 的 offset // Offset of the first location past the end of buffer_. uint64_t end_of_buffer_offset_;
// The Reader will start reading at the first record located at physical // position >= initial_offset within the file. // Offset at which to start looking for the first record to return uint64_tconst initial_offset_;
// True if we are resynchronizing after a seek (initial_offset_ > 0). In // particular, a run of kMiddleType and kLastType records can be silently // skipped in this mode // 是否需要重新同步,通常因为 initial_offset_ seek 之后导致 bool resyncing_; };