/* 2bit.c - pack and unpack gene data using 2bit compression Copyright (C) 2003 John Comeau This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ #include #include #include #define BUFFERSIZE 4096 #define LINELENGTH 50 /* bases per line in FASTA file */ /*#define DEBUG 1 /* comment this out for release */ const char *packstring = "GATCgatcNnMR"; extern void exit(int); const char *RCSid = "$Id: 2bit.c,v 1.46 2004/08/04 05:30:45 jcomeau Exp $"; void skipheader(FILE *file) { char inbuffer[BUFFERSIZE]; char *header; #ifdef DEBUG fprintf(stderr, "skipping FASTA header\n"); #endif // side effect only; skips ">chr10\n" at beginning of FASTA file if ((header = fgets(inbuffer, BUFFERSIZE, file)) == NULL) { perror("Could not check file header"); exit(1); } if (*header != '>') { fprintf(stderr, "%s\n", "Illegal header: must start with '>'"); exit(1); } } char *outputfilename(char *filename, char *newext, char *newname) { #ifdef DEBUG fprintf(stderr, "appending extension '%s' to to output for filename %s\n", newext, filename); #endif int i; for (i = strlen(filename) - 1; i >= 0; i--) { if (filename[i] == '/' || filename[i] == '\\') { strcpy(newname, &filename[i + 1]); strcat(newname, newext); return newname; } } // no path separators were in the filename, so... strcpy(newname, filename); strcat(newname, newext); return newname; } int skippackedheader(FILE *file, char *origheader, unsigned long *length) { #ifdef DEBUG fprintf(stderr, "skipping 2bit header\n"); #endif // skips ">chr10:1-10\nP" at beginning of 2bit file, stores 'chr10' and 10 char byte; fscanf(file, "%[^:]:%*d-%lu\n", origheader, length); if (*length < 1) { fprintf(stderr, "Illegal header: sequence length less than 1\n"); exit(1); } while ((byte = getc(file)) != 'P') { if (byte > ' ') { fprintf(stderr, "Illegal header: '%c' between linefeed and P\n", byte); exit(1); } } return 0; } char *header(char *buffer, int bufsize, FILE *file) { #ifdef DEBUG fprintf(stderr, "reading in file header\n"); #endif char *i; fread(buffer, 1, bufsize, file); i = strchr(buffer, '\n'); if (i == NULL) { fprintf(stderr, "%s\n", "Header too long for me"); exit(1); } /* trim the header */ while (*i <= ' ') { *i-- = 0; } return buffer; } int countbases(char *buffer, int bufsize) { #ifdef DEBUG fprintf(stderr, "counting bases in %d-byte buffer\n", bufsize); #endif // returns count of anything in 'GATCgatcNnMR' in buffer int i; char *legal = "GATCgatcNnMR\n\r\t "; int count = 0; for (i = 0; i < bufsize; i++) { if (strchr(legal, buffer[i]) != NULL) { if (buffer[i] > ' ') { //fprintf(stderr, "counting '%c'\n", buffer[i]); count++; } } else { fprintf(stderr, "Illegal character '%c' in buffer\n", buffer[i]); exit(1); } } return count; } long countfile(char *infile, int print) { #ifdef DEBUG fprintf(stderr, "counting bases in file %s\n", infile); #endif FILE *file; char inbuffer[BUFFERSIZE]; int bufsize = BUFFERSIZE; int howmany; long basecount = 0; file = fopen(infile, "rb"); if (file == NULL) { perror("Could not open input file"); exit(1); } skipheader(file); while ((howmany = fread(inbuffer, 1, bufsize, file)) != 0) { basecount += countbases(inbuffer, howmany); } fclose(file); if (print) printf("%ld\n", basecount); return basecount; } int packvalue(char byte) { #ifdef DEBUG fprintf(stderr, "packing '%c'\n", byte); #endif char *i; if ((i = strchr(packstring, byte)) != NULL) { return (int)(i - packstring) % 4; } else { return -1; } } int maskvalue(char byte) { #ifdef DEBUG fprintf(stderr, "generating mask for '%c'\n", byte); #endif char *i; if ((i = strchr(packstring, byte)) != NULL) { return (int)(i - packstring) / 4; } else { return -1; } } char unmask(char byte, char mask) { #ifdef DEBUG fprintf(stderr, "unmasking char '%c' with mask %d\n", byte, mask); #endif int offset, unmasked; offset = (int)(strchr(packstring, byte) - packstring); if (offset > 3) { fprintf(stderr, "'%c' not found in first 4 bytes of '%s'\n", byte, packstring); exit(1); } unmasked = packstring[(mask * 4) + offset]; #ifdef DEBUG fprintf(stderr, "unmasked char '%c' is '%c'\n", byte, unmasked); #endif return unmasked; } int unpack(char *infile) { #ifdef DEBUG fprintf(stderr, "unpacking file %s\n", infile); #endif char origheader[BUFFERSIZE]; char outfile[BUFFERSIZE]; FILE *file; FILE *output; char mask; long basecount = 0; long beginpacked = 0; long begindata = 0; char dataline[BUFFERSIZE]; unsigned char packedbyte; int linepointer = 0; int packedpointer = 3; // starts at high 2 bits (3 << (2 * 3)) long outputpointer; long processed = 0; int i; if ((file = fopen(infile, "rb")) == NULL) { perror("Could not open input file"); exit(1); } if ((output = fopen(outputfilename(infile, ".fa", outfile), "wb")) == \ NULL) { perror("Could not create output file"); exit(1); } skippackedheader(file, origheader, &basecount); beginpacked = ftell(file); fputs(origheader, output); fputc('\n', output); begindata = ftell(output); while (processed < basecount) { packedbyte = getc(file); for (i = 0; i < 4; i++) { dataline[linepointer++] = packstring[(packedbyte & \ (3 << 2 * packedpointer)) >> \ (2 * packedpointer)]; packedpointer--; packedpointer &= 3; // when it goes below zero, this will reset it if (linepointer == LINELENGTH) { dataline[linepointer] = 0; fputs(dataline, output); fputc('\n', output); linepointer = 0; } if (++processed == basecount) break; } } if (linepointer > 0) { dataline[linepointer] = 0; fputs(dataline, output); fputc('\n', output); linepointer = 0; } fclose(output); // only for a moment, now we need to correct it if ((output = fopen(outfile, "rb+")) == NULL) { fprintf(stderr, "%s\n", "Could not correct output\n"); exit(1); } processed = 0; packedpointer = 3; // reset pointer /* get the first line, then reset file pointer so we can overwrite it */ if (fseek(output, begindata, SEEK_SET) == EOF) perror("SEEK error"); fgets(dataline, BUFFERSIZE, output); if (fseek(output, begindata, SEEK_SET) == EOF) perror("SEEK error"); while (processed < basecount) { packedbyte = getc(file); for (i = 0; i < 4; i++) { mask = (packedbyte & (3 << 2 * packedpointer)) >> (2 * packedpointer); packedpointer--; packedpointer &= 3; // when it goes below zero, this will reset it dataline[linepointer] = unmask(dataline[linepointer], mask); linepointer++; if (linepointer == LINELENGTH) { dataline[linepointer] = 0; fputs(dataline, output); fputc('\n', output); linepointer = 0; } if (++processed == basecount) break; if (linepointer == 0) { outputpointer = ftell(output); fgets(dataline, BUFFERSIZE, output); if (fseek(output, outputpointer, SEEK_SET) == EOF) perror("SEEK error"); } } } if (linepointer > 0) { dataline[linepointer] = 0; fputs(dataline, output); fputc('\n', output); linepointer = 0; } fclose(output); return 0; } int pack(char *infile) { #ifdef DEBUG fprintf(stderr, "packing file %s\n", infile); #endif FILE *file; FILE *outfile; int bufsize = BUFFERSIZE; char inbuffer[BUFFERSIZE]; char copyheader[BUFFERSIZE]; long count = 0L; int howmany; char packed = 0; char offset = 0; int packedvalue; int i; long basecount = countfile(infile, 0); if ((file = fopen(infile, "rb")) == NULL) { perror("Cannot open input file"); exit(1); } if ((outfile = \ fopen(outputfilename(infile, ".2bit", inbuffer), "wb")) == NULL) { perror("Cannot create output file"); exit(1); } fseek(file, 0L, SEEK_SET); strncpy(copyheader, header(inbuffer, bufsize, file), BUFFERSIZE); sprintf(inbuffer, "%s:1-%lu\nP", copyheader, basecount); fwrite(inbuffer, 1, strlen(inbuffer), outfile); fseek(file, 0L, SEEK_SET); skipheader(file); while (count < basecount) { while ((howmany = fread(inbuffer, 1, bufsize, file)) != 0) { for (i = 0; i < howmany; i++) { if ((packedvalue = packvalue(inbuffer[i])) != -1) { packed <<= 2; packed |= packedvalue; offset = (offset + 1) % 4; count++; if (offset == 0) { fwrite(&packed, 1, 1, outfile); packed = 0; } } } } // time to pack up and output any remaining data if (offset != 0) { while (offset != 0) { packed <<= 2; offset = (offset + 1) % 4; } fwrite(&packed, 1, 1, outfile); } } // done with the data, now output the maskinfo packed = 0; count = 0L; fseek(file, 0L, SEEK_SET); skipheader(file); while (count < basecount) { while ((howmany = fread(inbuffer, 1, bufsize, file)) != 0) { for (i = 0; i < howmany; i++) { if ((packedvalue = maskvalue(inbuffer[i])) != -1) { packed <<= 2; packed |= packedvalue; offset = (offset + 1) % 4; count++; if (offset == 0) { fwrite(&packed, 1, 1, outfile); packed = 0; } } } } // time to pack up and output any remaining maskinfo if (offset != 0) { while (offset != 0) { packed <<= 2; offset = (offset + 1) % 4; } fwrite(&packed, 1, 1, outfile); } } /* end 'while' loop */ return 0; } int main(int argc, char **argv) { int i; if (argc < 2) { fprintf(stderr, "Usage: %s %s %s\n", argv[0], "COMMAND", "INFILE..."); fprintf(stderr, "%s\n", " COMMAND is one of: 'count', 'pack', 'unpack'"); } else if (strcmp(argv[1], "count") == 0) { for (i = 2; i < argc; i++) { countfile(argv[i], 1); } } else if (strcmp(argv[1], "pack") == 0) { for (i = 2; i < argc; i++) { pack(argv[i]); } } else if (strcmp(argv[1], "unpack") == 0) { for (i = 2; i < argc; i++) { unpack(argv[i]); } } else { fprintf(stderr, "Bad parameter, run %s without args for help\n", argv[0]); return 1; } return 0; }