| // SPDX-License-Identifier: GPL-2.0-or-later |
| |
| /* P9 gunzip sample code for demonstrating the P9 NX hardware |
| * interface. Not intended for production use or for performance or |
| * compression ratio measurements. Note also that /dev/crypto/gzip, |
| * VAS and skiboot support are required. |
| * |
| * Copyright 2020 IBM Corp. |
| * |
| * Author: Bulent Abali <abali@us.ibm.com> |
| * |
| * https://github.com/libnxz/power-gzip for zlib api and other utils |
| * Definitions of acronyms used here. See |
| * P9 NX Gzip Accelerator User's Manual for details: |
| * https://github.com/libnxz/power-gzip/blob/develop/doc/power_nx_gzip_um.pdf |
| * |
| * adler/crc: 32 bit checksums appended to stream tail |
| * ce: completion extension |
| * cpb: coprocessor parameter block (metadata) |
| * crb: coprocessor request block (command) |
| * csb: coprocessor status block (status) |
| * dht: dynamic huffman table |
| * dde: data descriptor element (address, length) |
| * ddl: list of ddes |
| * dh/fh: dynamic and fixed huffman types |
| * fc: coprocessor function code |
| * histlen: history/dictionary length |
| * history: sliding window of up to 32KB of data |
| * lzcount: Deflate LZ symbol counts |
| * rembytecnt: remaining byte count |
| * sfbt: source final block type; last block's type during decomp |
| * spbc: source processed byte count |
| * subc: source unprocessed bit count |
| * tebc: target ending bit count; valid bits in the last byte |
| * tpbc: target processed byte count |
| * vas: virtual accelerator switch; the user mode interface |
| */ |
| |
| #define _ISOC11_SOURCE // For aligned_alloc() |
| #define _DEFAULT_SOURCE // For endian.h |
| |
| #include <stdio.h> |
| #include <stdlib.h> |
| #include <string.h> |
| #include <unistd.h> |
| #include <stdint.h> |
| #include <sys/types.h> |
| #include <sys/stat.h> |
| #include <sys/time.h> |
| #include <sys/fcntl.h> |
| #include <sys/mman.h> |
| #include <endian.h> |
| #include <bits/endian.h> |
| #include <sys/ioctl.h> |
| #include <assert.h> |
| #include <errno.h> |
| #include <signal.h> |
| #include "nxu.h" |
| #include "nx.h" |
| #include "crb.h" |
| |
/*
 * File-scope state set up by main() in this file.
 * NOTE(review): presumably consumed by the debug/log helpers declared
 * in the nx headers -- confirm against nxu.h.
 */
FILE *nx_gzip_log;
int nx_dbg;

/* Classic two-operand min/max; each argument may be evaluated twice */
#define NX_MIN(X, Y) (((Y) > (X)) ? (X) : (Y))
#define NX_MAX(X, Y) (((Y) < (X)) ? (X) : (Y))

/* Single-byte read from a stdio stream */
#define GETINPC(X) fgetc(X)
/* Capacity of the buffer holding the gzip header FNAME field */
#define FNAME_MAX 1024

/*
 * Circular fifo bookkeeping.
 *
 * A fifo is described by: cur  - offset of the first used byte,
 *                         used - number of used bytes,
 *                         len  - total capacity.
 * The used (or free) region may wrap around the end of the buffer;
 * "first" is the part that comes first in fifo order and "last" is
 * the part after the wrap.
 */
#define fifo_used_bytes(used) (used)
#define fifo_free_bytes(used, len) ((len) - (used))

/* Sizes of the first and last free parts */
#define fifo_free_first_bytes(cur, used, len) \
	((((cur) + (used)) > (len)) ? 0 : (len) - ((cur) + (used)))
#define fifo_free_last_bytes(cur, used, len) \
	((((cur) + (used)) > (len)) ? (len) - (used) : (cur))

/* Sizes of the first and last used parts */
#define fifo_used_first_bytes(cur, used, len) \
	((((cur) + (used)) > (len)) ? (len) - (cur) : (used))
#define fifo_used_last_bytes(cur, used, len) \
	((((cur) + (used)) > (len)) ? ((used) + (cur)) - (len) : 0)

/* Start offsets of the first and last free parts */
#define fifo_free_first_offset(cur, used) ((cur) + (used))
#define fifo_free_last_offset(cur, used, len) \
	fifo_used_last_bytes(cur, used, len)

/* Start offsets of the first and last used parts */
#define fifo_used_first_offset(cur) (cur)
#define fifo_used_last_offset(cur) (0)

/* Buffer geometry, all powers of two */
const int fifo_in_len = 1 << 24;	/* 16 MiB input fifo */
const int fifo_out_len = 1 << 24;	/* 16 MiB output fifo */
const int page_sz = 1 << 16;		/* 64 KiB */
const int line_sz = 1 << 7;		/* 128 bytes */
const int window_max = 1 << 15;		/* 32 KiB */
| |
| /* |
| * Adds an (address, len) pair to the list of ddes (ddl) and updates |
| * the base dde. ddl[0] is the only dde in a direct dde which |
| * contains a single (addr,len) pair. For more pairs, ddl[0] becomes |
| * the indirect (base) dde that points to a list of direct ddes. |
| * See Section 6.4 of the NX-gzip user manual for DDE description. |
| * Addr=NULL, len=0 clears the ddl[0]. Returns the total number of |
| * bytes in ddl. Caller is responsible for allocting the array of |
| * nx_dde_t *ddl. If N addresses are required in the scatter-gather |
| * list, the ddl array must have N+1 entries minimum. |
| */ |
| static inline uint32_t nx_append_dde(struct nx_dde_t *ddl, void *addr, |
| uint32_t len) |
| { |
| uint32_t ddecnt; |
| uint32_t bytes; |
| |
| if (addr == NULL && len == 0) { |
| clearp_dde(ddl); |
| return 0; |
| } |
| |
| NXPRT(fprintf(stderr, "%d: %s addr %p len %x\n", __LINE__, addr, |
| __func__, len)); |
| |
| /* Number of ddes in the dde list ; == 0 when it is a direct dde */ |
| ddecnt = getpnn(ddl, dde_count); |
| bytes = getp32(ddl, ddebc); |
| |
| if (ddecnt == 0 && bytes == 0) { |
| /* First dde is unused; make it a direct dde */ |
| bytes = len; |
| putp32(ddl, ddebc, bytes); |
| putp64(ddl, ddead, (uint64_t) addr); |
| } else if (ddecnt == 0) { |
| /* Converting direct to indirect dde |
| * ddl[0] becomes head dde of ddl |
| * copy direct to indirect first. |
| */ |
| ddl[1] = ddl[0]; |
| |
| /* Add the new dde next */ |
| clear_dde(ddl[2]); |
| put32(ddl[2], ddebc, len); |
| put64(ddl[2], ddead, (uint64_t) addr); |
| |
| /* Ddl head points to 2 direct ddes */ |
| ddecnt = 2; |
| putpnn(ddl, dde_count, ddecnt); |
| bytes = bytes + len; |
| putp32(ddl, ddebc, bytes); |
| /* Pointer to the first direct dde */ |
| putp64(ddl, ddead, (uint64_t) &ddl[1]); |
| } else { |
| /* Append a dde to an existing indirect ddl */ |
| ++ddecnt; |
| clear_dde(ddl[ddecnt]); |
| put64(ddl[ddecnt], ddead, (uint64_t) addr); |
| put32(ddl[ddecnt], ddebc, len); |
| |
| putpnn(ddl, dde_count, ddecnt); |
| bytes = bytes + len; |
| putp32(ddl, ddebc, bytes); /* byte sum of all dde */ |
| } |
| return bytes; |
| } |
| |
| /* |
| * Touch specified number of pages represented in number bytes |
| * beginning from the first buffer in a dde list. |
| * Do not touch the pages past buf_sz-th byte's page. |
| * |
| * Set buf_sz = 0 to touch all pages described by the ddep. |
| */ |
| static int nx_touch_pages_dde(struct nx_dde_t *ddep, long buf_sz, long page_sz, |
| int wr) |
| { |
| uint32_t indirect_count; |
| uint32_t buf_len; |
| long total; |
| uint64_t buf_addr; |
| struct nx_dde_t *dde_list; |
| int i; |
| |
| assert(!!ddep); |
| |
| indirect_count = getpnn(ddep, dde_count); |
| |
| NXPRT(fprintf(stderr, "%s dde_count %d request len ", __func__, |
| indirect_count)); |
| NXPRT(fprintf(stderr, "0x%lx\n", buf_sz)); |
| |
| if (indirect_count == 0) { |
| /* Direct dde */ |
| buf_len = getp32(ddep, ddebc); |
| buf_addr = getp64(ddep, ddead); |
| |
| NXPRT(fprintf(stderr, "touch direct ddebc 0x%x ddead %p\n", |
| buf_len, (void *)buf_addr)); |
| |
| if (buf_sz == 0) |
| nxu_touch_pages((void *)buf_addr, buf_len, page_sz, wr); |
| else |
| nxu_touch_pages((void *)buf_addr, NX_MIN(buf_len, |
| buf_sz), page_sz, wr); |
| |
| return ERR_NX_OK; |
| } |
| |
| /* Indirect dde */ |
| if (indirect_count > MAX_DDE_COUNT) |
| return ERR_NX_EXCESSIVE_DDE; |
| |
| /* First address of the list */ |
| dde_list = (struct nx_dde_t *) getp64(ddep, ddead); |
| |
| if (buf_sz == 0) |
| buf_sz = getp32(ddep, ddebc); |
| |
| total = 0; |
| for (i = 0; i < indirect_count; i++) { |
| buf_len = get32(dde_list[i], ddebc); |
| buf_addr = get64(dde_list[i], ddead); |
| total += buf_len; |
| |
| NXPRT(fprintf(stderr, "touch loop len 0x%x ddead %p total ", |
| buf_len, (void *)buf_addr)); |
| NXPRT(fprintf(stderr, "0x%lx\n", total)); |
| |
| /* Touching fewer pages than encoded in the ddebc */ |
| if (total > buf_sz) { |
| buf_len = NX_MIN(buf_len, total - buf_sz); |
| nxu_touch_pages((void *)buf_addr, buf_len, page_sz, wr); |
| NXPRT(fprintf(stderr, "touch loop break len 0x%x ", |
| buf_len)); |
| NXPRT(fprintf(stderr, "ddead %p\n", (void *)buf_addr)); |
| break; |
| } |
| nxu_touch_pages((void *)buf_addr, buf_len, page_sz, wr); |
| } |
| return ERR_NX_OK; |
| } |
| |
| /* |
| * Src and dst buffers are supplied in scatter gather lists. |
| * NX function code and other parameters supplied in cmdp. |
| */ |
| static int nx_submit_job(struct nx_dde_t *src, struct nx_dde_t *dst, |
| struct nx_gzip_crb_cpb_t *cmdp, void *handle) |
| { |
| uint64_t csbaddr; |
| |
| memset((void *)&cmdp->crb.csb, 0, sizeof(cmdp->crb.csb)); |
| |
| cmdp->crb.source_dde = *src; |
| cmdp->crb.target_dde = *dst; |
| |
| /* Status, output byte count in tpbc */ |
| csbaddr = ((uint64_t) &cmdp->crb.csb) & csb_address_mask; |
| put64(cmdp->crb, csb_address, csbaddr); |
| |
| /* NX reports input bytes in spbc; cleared */ |
| cmdp->cpb.out_spbc_comp_wrap = 0; |
| cmdp->cpb.out_spbc_comp_with_count = 0; |
| cmdp->cpb.out_spbc_decomp = 0; |
| |
| /* Clear output */ |
| put32(cmdp->cpb, out_crc, INIT_CRC); |
| put32(cmdp->cpb, out_adler, INIT_ADLER); |
| |
| /* Submit the crb, the job descriptor, to the accelerator. */ |
| return nxu_submit_job(cmdp, handle); |
| } |
| |
| int decompress_file(int argc, char **argv, void *devhandle) |
| { |
| FILE *inpf = NULL; |
| FILE *outf = NULL; |
| |
| int c, expect, i, cc, rc = 0; |
| char gzfname[FNAME_MAX]; |
| |
| /* Queuing, file ops, byte counting */ |
| char *fifo_in, *fifo_out; |
| int used_in, cur_in, used_out, cur_out, read_sz, n; |
| int first_free, last_free, first_used, last_used; |
| int first_offset, last_offset; |
| int write_sz, free_space, source_sz; |
| int source_sz_estimate, target_sz_estimate; |
| uint64_t last_comp_ratio = 0; /* 1000 max */ |
| uint64_t total_out = 0; |
| int is_final, is_eof; |
| |
| /* nx hardware */ |
| int sfbt, subc, spbc, tpbc, nx_ce, fc, resuming = 0; |
| int history_len = 0; |
| struct nx_gzip_crb_cpb_t cmd, *cmdp; |
| struct nx_dde_t *ddl_in; |
| struct nx_dde_t dde_in[6] __aligned(128); |
| struct nx_dde_t *ddl_out; |
| struct nx_dde_t dde_out[6] __aligned(128); |
| int pgfault_retries; |
| |
| /* when using mmap'ed files */ |
| off_t input_file_offset; |
| |
| if (argc > 2) { |
| fprintf(stderr, "usage: %s <fname> or stdin\n", argv[0]); |
| fprintf(stderr, " writes to stdout or <fname>.nx.gunzip\n"); |
| return -1; |
| } |
| |
| if (argc == 1) { |
| inpf = stdin; |
| outf = stdout; |
| } else if (argc == 2) { |
| char w[1024]; |
| char *wp; |
| |
| inpf = fopen(argv[1], "r"); |
| if (inpf == NULL) { |
| perror(argv[1]); |
| return -1; |
| } |
| |
| /* Make a new file name to write to. Ignoring '.gz' */ |
| wp = (NULL != (wp = strrchr(argv[1], '/'))) ? (wp+1) : argv[1]; |
| strcpy(w, wp); |
| strcat(w, ".nx.gunzip"); |
| |
| outf = fopen(w, "w"); |
| if (outf == NULL) { |
| perror(w); |
| return -1; |
| } |
| } |
| |
| /* Decode the gzip header */ |
| c = GETINPC(inpf); expect = 0x1f; /* ID1 */ |
| if (c != expect) |
| goto err1; |
| |
| c = GETINPC(inpf); expect = 0x8b; /* ID2 */ |
| if (c != expect) |
| goto err1; |
| |
| c = GETINPC(inpf); expect = 0x08; /* CM */ |
| if (c != expect) |
| goto err1; |
| |
| int flg = GETINPC(inpf); /* FLG */ |
| |
| if (flg & 0xE0 || flg & 0x4 || flg == EOF) |
| goto err2; |
| |
| fprintf(stderr, "gzHeader FLG %x\n", flg); |
| |
| /* Read 6 bytes; ignoring the MTIME, XFL, OS fields in this |
| * sample code. |
| */ |
| for (i = 0; i < 6; i++) { |
| char tmp[10]; |
| |
| tmp[i] = GETINPC(inpf); |
| if (tmp[i] == EOF) |
| goto err3; |
| fprintf(stderr, "%02x ", tmp[i]); |
| if (i == 5) |
| fprintf(stderr, "\n"); |
| } |
| fprintf(stderr, "gzHeader MTIME, XFL, OS ignored\n"); |
| |
| /* FNAME */ |
| if (flg & 0x8) { |
| int k = 0; |
| |
| do { |
| c = GETINPC(inpf); |
| if (c == EOF || k >= FNAME_MAX) |
| goto err3; |
| gzfname[k++] = c; |
| } while (c); |
| fprintf(stderr, "gzHeader FNAME: %s\n", gzfname); |
| } |
| |
| /* FHCRC */ |
| if (flg & 0x2) { |
| c = GETINPC(inpf); |
| if (c == EOF) |
| goto err3; |
| c = GETINPC(inpf); |
| if (c == EOF) |
| goto err3; |
| fprintf(stderr, "gzHeader FHCRC: ignored\n"); |
| } |
| |
| used_in = cur_in = used_out = cur_out = 0; |
| is_final = is_eof = 0; |
| |
| /* Allocate one page larger to prevent page faults due to NX |
| * overfetching. |
| * Either do this (char*)(uintptr_t)aligned_alloc or use |
| * -std=c11 flag to make the int-to-pointer warning go away. |
| */ |
| assert((fifo_in = (char *)(uintptr_t)aligned_alloc(line_sz, |
| fifo_in_len + page_sz)) != NULL); |
| assert((fifo_out = (char *)(uintptr_t)aligned_alloc(line_sz, |
| fifo_out_len + page_sz + line_sz)) != NULL); |
| /* Leave unused space due to history rounding rules */ |
| fifo_out = fifo_out + line_sz; |
| nxu_touch_pages(fifo_out, fifo_out_len, page_sz, 1); |
| |
| ddl_in = &dde_in[0]; |
| ddl_out = &dde_out[0]; |
| cmdp = &cmd; |
| memset(&cmdp->crb, 0, sizeof(cmdp->crb)); |
| |
| read_state: |
| |
| /* Read from .gz file */ |
| |
| NXPRT(fprintf(stderr, "read_state:\n")); |
| |
| if (is_eof != 0) |
| goto write_state; |
| |
| /* We read in to fifo_in in two steps: first: read in to from |
| * cur_in to the end of the buffer. last: if free space wrapped |
| * around, read from fifo_in offset 0 to offset cur_in. |
| */ |
| |
| /* Reset fifo head to reduce unnecessary wrap arounds */ |
| cur_in = (used_in == 0) ? 0 : cur_in; |
| |
| /* Free space total is reduced by a gap */ |
| free_space = NX_MAX(0, fifo_free_bytes(used_in, fifo_in_len) |
| - line_sz); |
| |
| /* Free space may wrap around as first and last */ |
| first_free = fifo_free_first_bytes(cur_in, used_in, fifo_in_len); |
| last_free = fifo_free_last_bytes(cur_in, used_in, fifo_in_len); |
| |
| /* Start offsets of the free memory */ |
| first_offset = fifo_free_first_offset(cur_in, used_in); |
| last_offset = fifo_free_last_offset(cur_in, used_in, fifo_in_len); |
| |
| /* Reduce read_sz because of the line_sz gap */ |
| read_sz = NX_MIN(free_space, first_free); |
| n = 0; |
| if (read_sz > 0) { |
| /* Read in to offset cur_in + used_in */ |
| n = fread(fifo_in + first_offset, 1, read_sz, inpf); |
| used_in = used_in + n; |
| free_space = free_space - n; |
| assert(n <= read_sz); |
| if (n != read_sz) { |
| /* Either EOF or error; exit the read loop */ |
| is_eof = 1; |
| goto write_state; |
| } |
| } |
| |
| /* If free space wrapped around */ |
| if (last_free > 0) { |
| /* Reduce read_sz because of the line_sz gap */ |
| read_sz = NX_MIN(free_space, last_free); |
| n = 0; |
| if (read_sz > 0) { |
| n = fread(fifo_in + last_offset, 1, read_sz, inpf); |
| used_in = used_in + n; /* Increase used space */ |
| free_space = free_space - n; /* Decrease free space */ |
| assert(n <= read_sz); |
| if (n != read_sz) { |
| /* Either EOF or error; exit the read loop */ |
| is_eof = 1; |
| goto write_state; |
| } |
| } |
| } |
| |
| /* At this point we have used_in bytes in fifo_in with the |
| * data head starting at cur_in and possibly wrapping around. |
| */ |
| |
| write_state: |
| |
| /* Write decompressed data to output file */ |
| |
| NXPRT(fprintf(stderr, "write_state:\n")); |
| |
| if (used_out == 0) |
| goto decomp_state; |
| |
| /* If fifo_out has data waiting, write it out to the file to |
| * make free target space for the accelerator used bytes in |
| * the first and last parts of fifo_out. |
| */ |
| |
| first_used = fifo_used_first_bytes(cur_out, used_out, fifo_out_len); |
| last_used = fifo_used_last_bytes(cur_out, used_out, fifo_out_len); |
| |
| write_sz = first_used; |
| |
| n = 0; |
| if (write_sz > 0) { |
| n = fwrite(fifo_out + cur_out, 1, write_sz, outf); |
| used_out = used_out - n; |
| /* Move head of the fifo */ |
| cur_out = (cur_out + n) % fifo_out_len; |
| assert(n <= write_sz); |
| if (n != write_sz) { |
| fprintf(stderr, "error: write\n"); |
| rc = -1; |
| goto err5; |
| } |
| } |
| |
| if (last_used > 0) { /* If more data available in the last part */ |
| write_sz = last_used; /* Keep it here for later */ |
| n = 0; |
| if (write_sz > 0) { |
| n = fwrite(fifo_out, 1, write_sz, outf); |
| used_out = used_out - n; |
| cur_out = (cur_out + n) % fifo_out_len; |
| assert(n <= write_sz); |
| if (n != write_sz) { |
| fprintf(stderr, "error: write\n"); |
| rc = -1; |
| goto err5; |
| } |
| } |
| } |
| |
| decomp_state: |
| |
| /* NX decompresses input data */ |
| |
| NXPRT(fprintf(stderr, "decomp_state:\n")); |
| |
| if (is_final) |
| goto finish_state; |
| |
| /* Address/len lists */ |
| clearp_dde(ddl_in); |
| clearp_dde(ddl_out); |
| |
| /* FC, CRC, HistLen, Table 6-6 */ |
| if (resuming) { |
| /* Resuming a partially decompressed input. |
| * The key to resume is supplying the 32KB |
| * dictionary (history) to NX, which is basically |
| * the last 32KB of output produced. |
| */ |
| fc = GZIP_FC_DECOMPRESS_RESUME; |
| |
| cmdp->cpb.in_crc = cmdp->cpb.out_crc; |
| cmdp->cpb.in_adler = cmdp->cpb.out_adler; |
| |
| /* Round up the history size to quadword. Section 2.10 */ |
| history_len = (history_len + 15) / 16; |
| putnn(cmdp->cpb, in_histlen, history_len); |
| history_len = history_len * 16; /* bytes */ |
| |
| if (history_len > 0) { |
| /* Chain in the history buffer to the DDE list */ |
| if (cur_out >= history_len) { |
| nx_append_dde(ddl_in, fifo_out |
| + (cur_out - history_len), |
| history_len); |
| } else { |
| nx_append_dde(ddl_in, fifo_out |
| + ((fifo_out_len + cur_out) |
| - history_len), |
| history_len - cur_out); |
| /* Up to 32KB history wraps around fifo_out */ |
| nx_append_dde(ddl_in, fifo_out, cur_out); |
| } |
| |
| } |
| } else { |
| /* First decompress job */ |
| fc = GZIP_FC_DECOMPRESS; |
| |
| history_len = 0; |
| /* Writing 0 clears out subc as well */ |
| cmdp->cpb.in_histlen = 0; |
| total_out = 0; |
| |
| put32(cmdp->cpb, in_crc, INIT_CRC); |
| put32(cmdp->cpb, in_adler, INIT_ADLER); |
| put32(cmdp->cpb, out_crc, INIT_CRC); |
| put32(cmdp->cpb, out_adler, INIT_ADLER); |
| |
| /* Assuming 10% compression ratio initially; use the |
| * most recently measured compression ratio as a |
| * heuristic to estimate the input and output |
| * sizes. If we give too much input, the target buffer |
| * overflows and NX cycles are wasted, and then we |
| * must retry with smaller input size. 1000 is 100%. |
| */ |
| last_comp_ratio = 100UL; |
| } |
| cmdp->crb.gzip_fc = 0; |
| putnn(cmdp->crb, gzip_fc, fc); |
| |
| /* |
| * NX source buffers |
| */ |
| first_used = fifo_used_first_bytes(cur_in, used_in, fifo_in_len); |
| last_used = fifo_used_last_bytes(cur_in, used_in, fifo_in_len); |
| |
| if (first_used > 0) |
| nx_append_dde(ddl_in, fifo_in + cur_in, first_used); |
| |
| if (last_used > 0) |
| nx_append_dde(ddl_in, fifo_in, last_used); |
| |
| /* |
| * NX target buffers |
| */ |
| first_free = fifo_free_first_bytes(cur_out, used_out, fifo_out_len); |
| last_free = fifo_free_last_bytes(cur_out, used_out, fifo_out_len); |
| |
| /* Reduce output free space amount not to overwrite the history */ |
| int target_max = NX_MAX(0, fifo_free_bytes(used_out, fifo_out_len) |
| - (1<<16)); |
| |
| NXPRT(fprintf(stderr, "target_max %d (0x%x)\n", target_max, |
| target_max)); |
| |
| first_free = NX_MIN(target_max, first_free); |
| if (first_free > 0) { |
| first_offset = fifo_free_first_offset(cur_out, used_out); |
| nx_append_dde(ddl_out, fifo_out + first_offset, first_free); |
| } |
| |
| if (last_free > 0) { |
| last_free = NX_MIN(target_max - first_free, last_free); |
| if (last_free > 0) { |
| last_offset = fifo_free_last_offset(cur_out, used_out, |
| fifo_out_len); |
| nx_append_dde(ddl_out, fifo_out + last_offset, |
| last_free); |
| } |
| } |
| |
| /* Target buffer size is used to limit the source data size |
| * based on previous measurements of compression ratio. |
| */ |
| |
| /* source_sz includes history */ |
| source_sz = getp32(ddl_in, ddebc); |
| assert(source_sz > history_len); |
| source_sz = source_sz - history_len; |
| |
| /* Estimating how much source is needed to 3/4 fill a |
| * target_max size target buffer. If we overshoot, then NX |
| * must repeat the job with smaller input and we waste |
| * bandwidth. If we undershoot then we use more NX calls than |
| * necessary. |
| */ |
| |
| source_sz_estimate = ((uint64_t)target_max * last_comp_ratio * 3UL) |
| / 4000; |
| |
| if (source_sz_estimate < source_sz) { |
| /* Target might be small, therefore limiting the |
| * source data. |
| */ |
| source_sz = source_sz_estimate; |
| target_sz_estimate = target_max; |
| } else { |
| /* Source file might be small, therefore limiting target |
| * touch pages to a smaller value to save processor cycles. |
| */ |
| target_sz_estimate = ((uint64_t)source_sz * 1000UL) |
| / (last_comp_ratio + 1); |
| target_sz_estimate = NX_MIN(2 * target_sz_estimate, |
| target_max); |
| } |
| |
| source_sz = source_sz + history_len; |
| |
| /* Some NX condition codes require submitting the NX job again. |
| * Kernel doesn't handle NX page faults. Expects user code to |
| * touch pages. |
| */ |
| pgfault_retries = NX_MAX_FAULTS; |
| |
| restart_nx: |
| |
| putp32(ddl_in, ddebc, source_sz); |
| |
| /* Fault in pages */ |
| nxu_touch_pages(cmdp, sizeof(struct nx_gzip_crb_cpb_t), page_sz, 1); |
| nx_touch_pages_dde(ddl_in, 0, page_sz, 0); |
| nx_touch_pages_dde(ddl_out, target_sz_estimate, page_sz, 1); |
| |
| /* Send job to NX */ |
| cc = nx_submit_job(ddl_in, ddl_out, cmdp, devhandle); |
| |
| switch (cc) { |
| |
| case ERR_NX_TRANSLATION: |
| |
| /* We touched the pages ahead of time. In the most common case |
| * we shouldn't be here. But may be some pages were paged out. |
| * Kernel should have placed the faulting address to fsaddr. |
| */ |
| NXPRT(fprintf(stderr, "ERR_NX_TRANSLATION %p\n", |
| (void *)cmdp->crb.csb.fsaddr)); |
| |
| if (pgfault_retries == NX_MAX_FAULTS) { |
| /* Try once with exact number of pages */ |
| --pgfault_retries; |
| goto restart_nx; |
| } else if (pgfault_retries > 0) { |
| /* If still faulting try fewer input pages |
| * assuming memory outage |
| */ |
| if (source_sz > page_sz) |
| source_sz = NX_MAX(source_sz / 2, page_sz); |
| --pgfault_retries; |
| goto restart_nx; |
| } else { |
| fprintf(stderr, "cannot make progress; too many "); |
| fprintf(stderr, "page fault retries cc= %d\n", cc); |
| rc = -1; |
| goto err5; |
| } |
| |
| case ERR_NX_DATA_LENGTH: |
| |
| NXPRT(fprintf(stderr, "ERR_NX_DATA_LENGTH; ")); |
| NXPRT(fprintf(stderr, "stream may have trailing data\n")); |
| |
| /* Not an error in the most common case; it just says |
| * there is trailing data that we must examine. |
| * |
| * CC=3 CE(1)=0 CE(0)=1 indicates partial completion |
| * Fig.6-7 and Table 6-8. |
| */ |
| nx_ce = get_csb_ce_ms3b(cmdp->crb.csb); |
| |
| if (!csb_ce_termination(nx_ce) && |
| csb_ce_partial_completion(nx_ce)) { |
| /* Check CPB for more information |
| * spbc and tpbc are valid |
| */ |
| sfbt = getnn(cmdp->cpb, out_sfbt); /* Table 6-4 */ |
| subc = getnn(cmdp->cpb, out_subc); /* Table 6-4 */ |
| spbc = get32(cmdp->cpb, out_spbc_decomp); |
| tpbc = get32(cmdp->crb.csb, tpbc); |
| assert(target_max >= tpbc); |
| |
| goto ok_cc3; /* not an error */ |
| } else { |
| /* History length error when CE(1)=1 CE(0)=0. */ |
| rc = -1; |
| fprintf(stderr, "history length error cc= %d\n", cc); |
| goto err5; |
| } |
| |
| case ERR_NX_TARGET_SPACE: |
| |
| /* Target buffer not large enough; retry smaller input |
| * data; give at least 1 byte. SPBC/TPBC are not valid. |
| */ |
| assert(source_sz > history_len); |
| source_sz = ((source_sz - history_len + 2) / 2) + history_len; |
| NXPRT(fprintf(stderr, "ERR_NX_TARGET_SPACE; retry with ")); |
| NXPRT(fprintf(stderr, "smaller input data src %d hist %d\n", |
| source_sz, history_len)); |
| goto restart_nx; |
| |
| case ERR_NX_OK: |
| |
| /* This should not happen for gzip formatted data; |
| * we need trailing crc and isize |
| */ |
| fprintf(stderr, "ERR_NX_OK\n"); |
| spbc = get32(cmdp->cpb, out_spbc_decomp); |
| tpbc = get32(cmdp->crb.csb, tpbc); |
| assert(target_max >= tpbc); |
| assert(spbc >= history_len); |
| source_sz = spbc - history_len; |
| goto offsets_state; |
| |
| default: |
| fprintf(stderr, "error: cc= %d\n", cc); |
| rc = -1; |
| goto err5; |
| } |
| |
| ok_cc3: |
| |
| NXPRT(fprintf(stderr, "cc3: sfbt: %x\n", sfbt)); |
| |
| assert(spbc > history_len); |
| source_sz = spbc - history_len; |
| |
| /* Table 6-4: Source Final Block Type (SFBT) describes the |
| * last processed deflate block and clues the software how to |
| * resume the next job. SUBC indicates how many input bits NX |
| * consumed but did not process. SPBC indicates how many |
| * bytes of source were given to the accelerator including |
| * history bytes. |
| */ |
| |
| switch (sfbt) { |
| int dhtlen; |
| |
| case 0x0: /* Deflate final EOB received */ |
| |
| /* Calculating the checksum start position. */ |
| |
| source_sz = source_sz - subc / 8; |
| is_final = 1; |
| break; |
| |
| /* Resume decompression cases are below. Basically |
| * indicates where NX has suspended and how to resume |
| * the input stream. |
| */ |
| |
| case 0x8: /* Within a literal block; use rembytecount */ |
| case 0x9: /* Within a literal block; use rembytecount; bfinal=1 */ |
| |
| /* Supply the partially processed source byte again */ |
| source_sz = source_sz - ((subc + 7) / 8); |
| |
| /* SUBC LS 3bits: number of bits in the first source byte need |
| * to be processed. |
| * 000 means all 8 bits; Table 6-3 |
| * Clear subc, histlen, sfbt, rembytecnt, dhtlen |
| */ |
| cmdp->cpb.in_subc = 0; |
| cmdp->cpb.in_sfbt = 0; |
| putnn(cmdp->cpb, in_subc, subc % 8); |
| putnn(cmdp->cpb, in_sfbt, sfbt); |
| putnn(cmdp->cpb, in_rembytecnt, getnn(cmdp->cpb, |
| out_rembytecnt)); |
| break; |
| |
| case 0xA: /* Within a FH block; */ |
| case 0xB: /* Within a FH block; bfinal=1 */ |
| |
| source_sz = source_sz - ((subc + 7) / 8); |
| |
| /* Clear subc, histlen, sfbt, rembytecnt, dhtlen */ |
| cmdp->cpb.in_subc = 0; |
| cmdp->cpb.in_sfbt = 0; |
| putnn(cmdp->cpb, in_subc, subc % 8); |
| putnn(cmdp->cpb, in_sfbt, sfbt); |
| break; |
| |
| case 0xC: /* Within a DH block; */ |
| case 0xD: /* Within a DH block; bfinal=1 */ |
| |
| source_sz = source_sz - ((subc + 7) / 8); |
| |
| /* Clear subc, histlen, sfbt, rembytecnt, dhtlen */ |
| cmdp->cpb.in_subc = 0; |
| cmdp->cpb.in_sfbt = 0; |
| putnn(cmdp->cpb, in_subc, subc % 8); |
| putnn(cmdp->cpb, in_sfbt, sfbt); |
| |
| dhtlen = getnn(cmdp->cpb, out_dhtlen); |
| putnn(cmdp->cpb, in_dhtlen, dhtlen); |
| assert(dhtlen >= 42); |
| |
| /* Round up to a qword */ |
| dhtlen = (dhtlen + 127) / 128; |
| |
| while (dhtlen > 0) { /* Copy dht from cpb.out to cpb.in */ |
| --dhtlen; |
| cmdp->cpb.in_dht[dhtlen] = cmdp->cpb.out_dht[dhtlen]; |
| } |
| break; |
| |
| case 0xE: /* Within a block header; bfinal=0; */ |
| /* Also given if source data exactly ends (SUBC=0) with |
| * EOB code with BFINAL=0. Means the next byte will |
| * contain a block header. |
| */ |
| case 0xF: /* within a block header with BFINAL=1. */ |
| |
| source_sz = source_sz - ((subc + 7) / 8); |
| |
| /* Clear subc, histlen, sfbt, rembytecnt, dhtlen */ |
| cmdp->cpb.in_subc = 0; |
| cmdp->cpb.in_sfbt = 0; |
| putnn(cmdp->cpb, in_subc, subc % 8); |
| putnn(cmdp->cpb, in_sfbt, sfbt); |
| |
| /* Engine did not process any data */ |
| if (is_eof && (source_sz == 0)) |
| is_final = 1; |
| } |
| |
| offsets_state: |
| |
| /* Adjust the source and target buffer offsets and lengths */ |
| |
| NXPRT(fprintf(stderr, "offsets_state:\n")); |
| |
| /* Delete input data from fifo_in */ |
| used_in = used_in - source_sz; |
| cur_in = (cur_in + source_sz) % fifo_in_len; |
| input_file_offset = input_file_offset + source_sz; |
| |
| /* Add output data to fifo_out */ |
| used_out = used_out + tpbc; |
| |
| assert(used_out <= fifo_out_len); |
| |
| total_out = total_out + tpbc; |
| |
| /* Deflate history is 32KB max. No need to supply more |
| * than 32KB on a resume. |
| */ |
| history_len = (total_out > window_max) ? window_max : total_out; |
| |
| /* To estimate expected expansion in the next NX job; 500 means 50%. |
| * Deflate best case is around 1 to 1000. |
| */ |
| last_comp_ratio = (1000UL * ((uint64_t)source_sz + 1)) |
| / ((uint64_t)tpbc + 1); |
| last_comp_ratio = NX_MAX(NX_MIN(1000UL, last_comp_ratio), 1); |
| NXPRT(fprintf(stderr, "comp_ratio %ld source_sz %d spbc %d tpbc %d\n", |
| last_comp_ratio, source_sz, spbc, tpbc)); |
| |
| resuming = 1; |
| |
| finish_state: |
| |
| NXPRT(fprintf(stderr, "finish_state:\n")); |
| |
| if (is_final) { |
| if (used_out) |
| goto write_state; /* More data to write out */ |
| else if (used_in < 8) { |
| /* Need at least 8 more bytes containing gzip crc |
| * and isize. |
| */ |
| rc = -1; |
| goto err4; |
| } else { |
| /* Compare checksums and exit */ |
| int i; |
| unsigned char tail[8]; |
| uint32_t cksum, isize; |
| |
| for (i = 0; i < 8; i++) |
| tail[i] = fifo_in[(cur_in + i) % fifo_in_len]; |
| fprintf(stderr, "computed checksum %08x isize %08x\n", |
| cmdp->cpb.out_crc, (uint32_t) (total_out |
| % (1ULL<<32))); |
| cksum = ((uint32_t) tail[0] | (uint32_t) tail[1]<<8 |
| | (uint32_t) tail[2]<<16 |
| | (uint32_t) tail[3]<<24); |
| isize = ((uint32_t) tail[4] | (uint32_t) tail[5]<<8 |
| | (uint32_t) tail[6]<<16 |
| | (uint32_t) tail[7]<<24); |
| fprintf(stderr, "stored checksum %08x isize %08x\n", |
| cksum, isize); |
| |
| if (cksum == cmdp->cpb.out_crc && isize == (uint32_t) |
| (total_out % (1ULL<<32))) { |
| rc = 0; goto ok1; |
| } else { |
| rc = -1; goto err4; |
| } |
| } |
| } else |
| goto read_state; |
| |
| return -1; |
| |
| err1: |
| fprintf(stderr, "error: not a gzip file, expect %x, read %x\n", |
| expect, c); |
| return -1; |
| |
| err2: |
| fprintf(stderr, "error: the FLG byte is wrong or not being handled\n"); |
| return -1; |
| |
| err3: |
| fprintf(stderr, "error: gzip header\n"); |
| return -1; |
| |
| err4: |
| fprintf(stderr, "error: checksum missing or mismatch\n"); |
| |
| err5: |
| ok1: |
| fprintf(stderr, "decomp is complete: fclose\n"); |
| fclose(outf); |
| |
| return rc; |
| } |
| |
| |
| int main(int argc, char **argv) |
| { |
| int rc; |
| struct sigaction act; |
| void *handle; |
| |
| nx_dbg = 0; |
| nx_gzip_log = NULL; |
| act.sa_handler = 0; |
| act.sa_sigaction = nxu_sigsegv_handler; |
| act.sa_flags = SA_SIGINFO; |
| act.sa_restorer = 0; |
| sigemptyset(&act.sa_mask); |
| sigaction(SIGSEGV, &act, NULL); |
| |
| handle = nx_function_begin(NX_FUNC_COMP_GZIP, 0); |
| if (!handle) { |
| fprintf(stderr, "Unable to init NX, errno %d\n", errno); |
| exit(-1); |
| } |
| |
| rc = decompress_file(argc, argv, handle); |
| |
| nx_function_end(handle); |
| |
| return rc; |
| } |