带你写一个Linux 下的打包软件 Tar

相信你对 linux 的 .tar.gz 有点熟悉，这就是先 tar 打包(.tar 后缀)，再对此 tar 文件用 gzip 压缩(.tar.gz)的后缀名。

值得注意的是，tar 不是压缩软件，它只负责把一堆文件/文件夹打包到一个文件(tar 文件)里，而文件链接、文件权限、相对路径等信息都会给你保存好。

一开始的设计是 tar 跟 gzip 各自只做一件事情，各司其职；后来发现这样用起来太麻烦了，于是就把压缩功能整合到 tar 里了。

复制

- Create a gzipped archive:     tar czf target.tar.gz file1 file2 file3

最近学习 OS 时写了一个类似 tar 的项目，那么今天就趁热打铁简单说一下如何写一个打包软件，这个软件会将重复的文件内容通过 md5 比较，复用旧的内容。

基本单位 block

block 可以理解为文件系统的最小单位，分别有以下类型：

directory block，文件夹 block，存储文件夹 meta 信息;
file block，文件 block，存储文件 meta 信息;
data block，只用来存文件内容;

Directory block：需要注意的是，entry 里要有 fileindex，用来存储该文件的名字在文件 block 的 filenames 数组中的下标（内容重复的文件共用同一个文件 block）。

同时，给项目一个 root dir。

复制

typedef struct {     char        name[SIFS_MAX_NAME_LENGTH]; // name of the directory     time_t        modtime;    // time last modified <- time()      uint32_t        nentries;// 文件夹内的文件/文件夹数量     struct {         SIFS_BLOCKID    blockID;    // subdirectory 或者 file 的 blockID         uint32_t    fileindex;    // 重复文件的不同名字     } entries[SIFS_MAX_ENTRIES]; } SIFS_DIRBLOCK;

文件 block：length 是文件内容的字节数，之后用来计算需要多少个 data block；firstblockID 记录第一个 data block 的 id；nfiles 记录有多少个文件共享这份相同的内容；filenames 则保存这些内容相同的文件各自的名字。

复制

typedef struct {     time_t        modtime;    // time first file added <- time()     size_t        length;        // length of files' contents in bytes      unsigned char    md5[MD5_BYTELEN];//the MD5 cryptographic digest (a summary) of the files' contents     SIFS_BLOCKID    firstblockID;// the block number (blockID) of the files' first data-block      uint32_t        nfiles;        // n files with identical contents     char        filenames[SIFS_MAX_ENTRIES][SIFS_MAX_NAME_LENGTH];// an array of each same file's name and its modification time. } SIFS_FILEBLOCK;

bitmaps数组，记录了每个 block 的类型，有：文件、文件夹以及data block 三种类型。

通用函数

就让大家看看关键函数好了：

读 tar 后的文件的 meta 头，记录了 block 的大小( blocksize) 以及多少个 blocks。

复制

/*
 * Read one SIFS_VOLUME_HEADER from the current position of `vol` into
 * `*header` and print its blocksize and nblocks fields.
 *
 * On a short read the header is zeroed and a diagnostic is written to
 * stderr, so callers never see indeterminate field values.
 */
void read_vol_header(FILE *vol, SIFS_VOLUME_HEADER *header) {
    if (fread(header, sizeof(SIFS_VOLUME_HEADER), 1, vol) != 1) {
        fprintf(stderr, "read_vol_header: could not read the volume header\n");
        *header = (SIFS_VOLUME_HEADER){0};
        return;
    }
    printf("header->blocksize %zu, header->nblocks %u\n", header->blocksize , header->nblocks);
}

bitmap，每次操作 tar 文件都要读的。

复制

/*
 * Read `nblocks` SIFS_BIT entries from the current position of `vol`
 * into `bitmap`.
 *
 * A non-positive `nblocks` reads nothing. A short read is reported on
 * stderr; entries beyond the ones actually read are left untouched.
 */
void read_bitmap(FILE *vol, SIFS_BIT *bitmap, int nblocks) {
    if (nblocks <= 0) {
        return;
    }

    /* Let fread do the element-count arithmetic in size_t rather than
     * multiplying into an int, which could overflow. */
    size_t wanted = (size_t)nblocks;
    size_t got = fread(bitmap, sizeof(SIFS_BIT), wanted, vol);
    if (got != wanted) {
        fprintf(stderr, "read_bitmap: expected %zu bitmap entries, read %zu\n", wanted, got);
    }
}

root_block 同理，读和写啥东西都要从 root block、root dir 出发。

复制

/*
 * Read one SIFS_DIRBLOCK from the current position of `vol` into
 * `*dirblock` and print its name, entry count and modification time.
 *
 * On a short read the block is zeroed (so nentries is 0) and a
 * diagnostic is written to stderr.
 */
void read_root_block(FILE *vol, SIFS_DIRBLOCK *dirblock){
    if (fread(dirblock, sizeof(SIFS_DIRBLOCK), 1, vol) != 1) {
        fprintf(stderr, "read_root_block: could not read the root directory block\n");
        *dirblock = (SIFS_DIRBLOCK){0};
        return;
    }

    /* The name comes straight from disk, so bound the print to the array
     * size in case it is not NUL-terminated. The casts make the arguments
     * match the conversion specifiers exactly. */
    printf("read_root_block finish, dirblock.name: %.*s, dirblock.entrieds: %u, dirblock.modtime %ld\n",
           (int)sizeof dirblock->name, dirblock->name,
           (unsigned)dirblock->nentries,
           (long)dirblock->modtime);
}

路径嘛，你懂的，./sifs_put volumn ~/res.txt /dirB/subdirB/subsubdir/newfileB，要读的内容可以靠 read 函数解决，但是写到 tar 文件里的就要手动解析递归查路径了。

复制

/*
 * Split `pathname` on '/' and append a copy of every component to
 * `route_names`, incrementing `*route_cnt` once per component.
 *
 * `pathname` itself is not modified; a private copy is tokenised.
 * strsep() yields an empty string for every empty component, so a leading
 * '/' produces an empty first entry.
 * The caller must ensure `route_names` has room for all components.
 */
void read_route_names(char* pathname, char** route_names, int *route_cnt) {
    char *path_copy = copyStr(pathname);
    if (path_copy == NULL) {
        return;
    }

    /* strsep() advances the pointer it is given and finally sets it to
     * NULL, so tokenise through a cursor and keep the base pointer. */
    char *cursor = path_copy;
    char *dir;
    while ((dir = strsep(&cursor, "/")) != NULL) {
        route_names[*route_cnt] = copyStr(dir);
        (*route_cnt)++;
    }

    /* NOTE(review): assumes copyStr() returns malloc-family memory - confirm. */
    free(path_copy);
}

以上几乎是 mkdir，rmdir，writefile，readfile，putfile 等等操作都要做的。

实现

然后，应该举一个 readfile 的例子就可以做代表了。

复制

/*
 * Forward declaration. Looks among the subdirectories of cur_dir_block for
 * the one named route_names[route_name_p] and descends into it; route_cnt
 * is the number of entries in route_names.
 */
int recursive_dirinfo(SIFS_DIRBLOCK *cur_dir_block, char **route_names, int route_name_p, int route_cnt);

实现：

复制

/*
 * Walk the directory tree along route_names[], starting at the
 * subdirectories of cur_dir_block.
 *
 * route_names[route_name_p] is the directory to find at this level. When
 * it is the second-to-last component, the last component is treated as a
 * file name and handed to do_read_file(); otherwise the search recurses
 * into the matched directory.
 *
 * Returns do_read_file()'s result when the path resolves, or 1 when no
 * matching directory exists or the path has no component after
 * route_name_p.
 *
 * Uses the file-scope vol, header, bitmap and blocksize.
 */
int recursive_dirinfo(SIFS_DIRBLOCK *cur_dir_block, char **route_names, int route_name_p, int route_cnt) {
    /* Both route_names[route_name_p] and route_names[route_name_p + 1]
     * must exist, otherwise the lookups below would read past the array. */
    if (route_name_p < 0 || route_name_p + 1 >= route_cnt) {
        return 1;
    }

    /* Size of the region before the blocks (header + bitmap); the same
     * for every entry, so compute it once. */
    const int start = sizeof(SIFS_VOLUME_HEADER) + header.nblocks*sizeof(SIFS_BIT);

    for (uint32_t i = 0; i < cur_dir_block->nentries; i++) {
        int blockid = cur_dir_block->entries[i].blockID;
        if (bitmap[blockid] != SIFS_DIR) {
            continue;
        }

        SIFS_DIRBLOCK dirblock;
        read_dir_block(vol, &dirblock, blockid * blocksize, start);
        if (strcmp(dirblock.name, route_names[route_name_p]) != 0) {
            continue;
        }

        if (route_name_p + 2 == route_cnt) {
            /* blockid identifies dirblock, so pass that same directory
             * as the file's parent. */
            return do_read_file(&dirblock, route_names[route_name_p+1], blockid);
        }
        return recursive_dirinfo(&dirblock, route_names, route_name_p+1, route_cnt);
    }
    return 1;
}

以 `./sifs_put volumn ~/res.txt /dirB/subdirB/subsubdir/newfileB` 为例子，如果递归找到 `subsubdir` 这个文件夹 block，就进行相应操作：

写文件就往 bitmap 一直找没有用过的 block，够写文件就写进去，文件夹更新一下信息。
读文件就是根据此文件夹 block，找里面的 newfileB

复制

/*
 * Find the file block whose first recorded name equals `filename` and
 * copy the file's contents into the buffer at *data, storing the content
 * length in *nbytes.
 *
 * Returns 0 when the file was found and read, 1 otherwise. *nbytes is
 * only written on success.
 *
 * Uses the file-scope vol, header, bitmap, blocksize, data and nbytes.
 *
 * NOTE(review): *data is assumed to already point at a buffer of at least
 * fileblock.length bytes; nothing here allocates it - confirm with the caller.
 * NOTE(review): only filenames[0] is compared and parent_dir is not
 * consulted, so a file stored under a later filenames[] slot is not found
 * and equal names in different directories are not distinguished.
 */
int do_read_file(SIFS_DIRBLOCK *parent_dir, char *filename,  int parent_dir_block) {
    (void)parent_dir;
    (void)parent_dir_block;

    printf("do_find_file_info, filename %s\n", filename);

    /* A zero block size would make the copy loop below spin forever. */
    if (header.blocksize == 0) {
        return 1;
    }

    /* Size of the region before the blocks (header + bitmap). */
    const int start = sizeof(SIFS_VOLUME_HEADER) + header.nblocks*sizeof(SIFS_BIT);

    for (int i = 1; i < header.nblocks; i++) {
        if (bitmap[i] != SIFS_FILE) {
            continue;
        }

        SIFS_FILEBLOCK fileblock;
        read_file_block(vol, &fileblock, i * blocksize, start);
        if (strcmp(fileblock.filenames[0], filename) != 0) {
            continue;
        }

        *nbytes = fileblock.length;

        /* Data blocks are taken to be consecutive from firstblockID.
         * Copy block by block, advancing the destination by the bytes
         * actually copied and shortening the final partial block.
         * NOTE(review): read_data_block's third argument is assumed to be
         * a byte count - confirm. */
        size_t remaining = fileblock.length;
        char *dst = (char *)(*data);
        for (int d_block_id = fileblock.firstblockID; remaining > 0; d_block_id++) {
            size_t chunk = remaining < header.blocksize ? remaining : header.blocksize;
            read_data_block(vol, dst, chunk, d_block_id * header.blocksize, start);
            dst += chunk;
            remaining -= chunk;
        }
        return 0;
    }
    return 1;
}

而真实的 tar 自然更复杂，还要记录用户权限、用户、用户组、文件类型等等：

复制

/*
 * POSIX ustar archive member header. Every field is a fixed-width
 * character array, so the fields occupy exactly the byte offsets noted
 * on the right.
 */
struct posix_header
{                       /* byte offset */
  char name[100];       /*   0 */   /* file name */
  char mode[8];         /* 100 */   /* permission bits */
  char uid[8];          /* 108 */   /* user id */
  char gid[8];          /* 116 */   /* group id */
  char size[12];        /* 124 */   /* file size */
  char mtime[12];       /* 136 */   /* modification time */
  char chksum[8];       /* 148 */   /* header checksum */
  char typeflag;        /* 156 */   /* file type flag, see below */
  char linkname[100];   /* 157 */   /* name the link points to */
  char magic[6];        /* 257 */
  char version[2];      /* 263 */
  char uname[32];       /* 265 */   /* user name */
  char gname[32];       /* 297 */   /* group name */
  char devmajor[8];     /* 329 */   /* device major number */
  char devminor[8];     /* 337 */   /* device minor number */
  char prefix[155];     /* 345 */
                        /* 500 */
};

/* Values of the typeflag field; they cover all Unix file types. */
#define REGTYPE  '0'            /* regular file */
#define LNKTYPE  '1'            /* link */
#define SYMTYPE  '2'            /* symbolic link */
#define CHRTYPE  '3'            /* character special */
#define BLKTYPE  '4'            /* block special */
#define DIRTYPE  '5'            /* directory */
#define FIFOTYPE '6'            /* FIFO special */
#define CONTTYPE '7'            /* reserved */

概览如此，写起来其实有点烦 - = -，有兴趣的读者可以写写。

喜欢已喜欢

305 2021-10-03 22:06:43 Linux Tar 软件

自动解锁Linux上的加密磁盘

防火墙，下一个是澳洲？

基本单位 block

通用函数

实现

您正在使用低版本浏览器，为了获得更良好的体验，建议您升级浏览器，为您推荐：