なんかできた。
圧縮プログラムenc、伸張プログラムdec。
% cc enc.c -o enc % cc dec.c -o dec % cat hoge abc abbc abc cde cde cdeeacd cabe abe cafe age fage aaaabbbbcccc abc abc abcc abc fbcd fbcd fcbd ddcb dcb % ./enc hoge % ll hoge* -rw-r--r-- 1 sun31048 J03A 106 Nov 19 11:01 hoge -rw-r--r-- 1 sun31048 J03A 91 Nov 19 11:02 hoge.huf % rm -f hoge % ./dec hoge.huf % ll hoge* -rw-r--r-- 1 sun31048 J03A 106 Nov 19 11:03 hoge -rw-r--r-- 1 sun31048 J03A 91 Nov 19 11:02 hoge.huf % cat hoge abc abbc abc cde cde cdeeacd cabe abe cafe age fage aaaabbbbcccc abc abc abcc abc fbcd fbcd fcbd ddcb dcb
というわけで、おーなんかできてるくせー。辞書部分がでかくなるので、あんまり縮みませんね。
% ll jpn.txt* -rw-r--r-- 1 sun31048 J03A 231295 Nov 19 11:06 jpn.txt -rw-r--r-- 1 sun31048 J03A 149476 Nov 19 11:06 jpn.txt.huf
日本語テキストだったらこの程度になった。辞書をどうやって格納するかって難しいことだなあ。めんどくさかったので、出現頻度表をそのままぶちこんだ。以下比較。
% compress -c jpn.txt >jpn.txt.Z % gzip -c jpn.txt >jpn.txt.gz % bzip2 -c jpn.txt >jpn.txt.bz2 % ll jpn.txt.* -rw-r--r-- 1 sun31048 J03A 117761 Nov 19 11:19 jpn.txt.Z -rw-r--r-- 1 sun31048 J03A 87057 Nov 19 11:19 jpn.txt.bz2 -rw-r--r-- 1 sun31048 J03A 104777 Nov 19 11:19 jpn.txt.gz -rw-r--r-- 1 sun31048 J03A 149476 Nov 19 11:06 jpn.txt.huf
まあ当然だけど負けまくり。
なんかビット演算に慣れればそんなに難しくなかったのかもしれない。
以下ソースとか。
気紛れでファイル名関数名変数名は変えまくりですが。
圧縮プログラムenc.c
#include <stdio.h> #include <string.h> #define HCODEMAX 256 typedef struct{ unsigned char c; int count; } item; struct node{ item data; struct node *lbranch, *rbranch; }; struct node** isort_count(struct node *ctable[], int length){ int i, j; struct node *tmp; for(i=1;i<length;++i){ tmp = ctable[i]; for(j=i;j>0 && ctable[j-1]->data.count>tmp->data.count;--j) ctable[j] = ctable[j-1]; ctable[j] = tmp; } return ctable; } struct node* mknode(struct node *lbranch, struct node *rbranch){ struct node *n; n = (struct node *)malloc(sizeof(struct node)); n->data.count = 0; n->lbranch = lbranch; n->rbranch = rbranch; return n; } struct node* mktree(struct node *ctable[], int length){ struct node *root, *n; if(length == 1) return ctable[0]; n = mknode(ctable[0], ctable[1]); n->data.count = ctable[0]->data.count + ctable[1]->data.count; ctable[1] = n; isort_count(&ctable[1], length-1); root = mktree(&ctable[1], length-1); return root; } int print_ctable(FILE *fp, struct node *ctable[]){ int i; for(i=0;i<256;++i){ if(ctable[i]->data.count > 0){ fwrite(&(ctable[i]->data.c), 1, 1, fp); fwrite(&(ctable[i]->data.count), 4, 1, fp); } } } int mkhtable(char *htable[], struct node *root, char *precode){ char lcode[HCODEMAX], rcode[HCODEMAX]; strcpy(lcode, precode); strcpy(rcode, precode); if(root->lbranch != NULL) mkhtable(htable, root->lbranch, strcat(lcode, "0")); if(root->rbranch != NULL) mkhtable(htable, root->rbranch, strcat(rcode, "1")); if(root->lbranch == NULL && root->rbranch == NULL){ htable[root->data.c] = (char *)malloc(sizeof(char)*HCODEMAX); strcpy(htable[root->data.c], precode); } return 1; } int putbits(char *bitstream, FILE *fp){ static int length = 0; static unsigned char bits = 0; if(bitstream == NULL){ bits = bits << 8 - length; fwrite(&bits, 1, 1, fp); fwrite(&length, 1, 1, fp); } else { while(*bitstream != '\0'){ bits = (bits << 1) | (*bitstream - '0'); ++length; if(length == 8){ fwrite(&bits, 1, 1, fp); length = 0; bits = 0; } ++bitstream; } } return 1; } int write_archive(FILE *arch, FILE *plain, char *htable[]){ int c; while((c = fgetc(plain)) != EOF) putbits(htable[c], arch); putbits(NULL, arch); return 1; } int encode(char *plainname){ char archname[256]; int i; FILE *plain, *arch; struct node *ctable[256], *root; char *htable[256]; strcpy(archname, plainname); strcat(archname, ".huf"); if((plain = fopen(plainname, "r")) == NULL){ perror("cannot open plain"); return -1; } if((arch = fopen(archname, "w")) == NULL){ perror("cannot open arch"); return -1; } for(i=0;i<256;++i){ ctable[i] = mknode(NULL, NULL); ctable[i]->data.count = 0; ctable[i]->data.c = i; } while((i=fgetc(plain)) != EOF) ++ctable[i]->data.count; print_ctable(arch, ctable); isort_count(ctable, 256); for(i=0;ctable[i]->data.count == 0;++i); root = mktree(&ctable[i], 256-i); for(i=0;i<256;++i) htable[i] = NULL; mkhtable(htable, root, ""); fseek(plain, 0, SEEK_SET); i = 0; fwrite(&i, 1, 1, arch); fwrite(&i, 4, 1, arch); write_archive(arch, plain, htable); fclose(plain); fclose(arch); return 1; } int main(int argc, char *argv[]){ if(argc>1) if(!encode(argv[argc-1])) puts("cannot encode"); return 0; }
dec.c
#include <stdio.h> #include <string.h> #define HCODEMAX 256 typedef struct{ unsigned char c; int count; } item; struct node{ item data; struct node *lbranch, *rbranch; }; struct node** isort_count(struct node *ctable[], int length){ int i, j; struct node *tmp; for(i=1;i<length;++i){ tmp = ctable[i]; for(j=i;j>0 && ctable[j-1]->data.count>tmp->data.count;--j) ctable[j] = ctable[j-1]; ctable[j] = tmp; } return ctable; } struct node* mknode(struct node *lbranch, struct node *rbranch){ struct node *n; n = (struct node *)malloc(sizeof(struct node)); n->data.count = 0; n->lbranch = lbranch; n->rbranch = rbranch; return n; } struct node* mktree(struct node *ctable[], int length){ struct node *root, *n; if(length == 1) return ctable[0]; n = mknode(ctable[0], ctable[1]); n->data.count = ctable[0]->data.count + ctable[1]->data.count; ctable[1] = n; isort_count(&ctable[1], length-1); root = mktree(&ctable[1], length-1); return root; } char getbits(int data[], struct node *root, FILE *plain){ static struct node *n = NULL; int i, length; length = 0; if(n == NULL) n = root; for(i=128;i>0;i=i>>1){ if(data[2] == EOF && data[1] == length) return 1; if((i & data[0]) == 0){ n = n->lbranch; } else { n = n->rbranch; } if(n->lbranch == NULL && n->rbranch == NULL){ fputc(n->data.c, plain); n = root; } ++length; } return 1; } int read_archive(FILE *arch, FILE *plain, struct node *root){ int i, data[3]; data[0] = fgetc(arch); data[1] = fgetc(arch); while((data[2] = fgetc(arch)) != EOF){ getbits(data, root, plain); data[0] = data[1]; data[1] = data[2]; } getbits(data, root, plain); return 1; } int get_ctable(struct node *ctable[], FILE *arch){ int i, c; unsigned int count; i = 0; while(1){ fread(&c, 1, 1, arch); fread(&count, 4, 1, arch); if(c==0 && count==0) return i; ctable[i] = mknode(NULL, NULL); ctable[i]->data.c = c; ctable[i]->data.count = count; ++i; } return 0; } int decode(char *archname){ char plainname[256], *cp; int i, ctable_len; FILE *plain, *arch; struct node *ctable[256], *root; strcpy(plainname, archname); cp = strstr(plainname, ".huf"); *cp = '\0'; if((arch = fopen(archname, "r")) == NULL){ perror("cannot open arch"); return -1; } if((plain = fopen(plainname, "w")) == NULL){ perror("cannot open plain"); return -1; } ctable_len = get_ctable(ctable, arch); isort_count(ctable, ctable_len); root = mktree(ctable, ctable_len); read_archive(arch, plain, root); fclose(plain); fclose(arch); return 1; } int main(int argc, char *argv[]){ if(argc>1) if(!decode(argv[argc-1])) puts("cannot decode"); return 0; }
ちょっとしたことですぐにゲロ吐いて死ぬプログラムです。完成度としては「条件さえ整えれば、ちゃんと動作することもある」といったところ。