shithub: riscv

ref: f42e53655e9a2a1b516326f6522fba88db59a81c
dir: /sys/src/cmd/file.c/

View raw version
#include <u.h>
#include <libc.h>
#include <bio.h>
#include <ctype.h>
#include <mach.h>

/*
 * file - determine type of file
 */
#define	LENDIAN(p)	((p)[0] | ((p)[1]<<8) | ((p)[2]<<16) | ((p)[3]<<24))

uchar	buf[6001];
short	cfreq[140];
short	wfreq[50];
int	nbuf;
Dir*	mbuf;
int	fd;
char 	*fname;
char	*slash;

enum
{
	Cword,
	Fword,
	Aword,
	Alword,
	Lword,
	I1,
	I2,
	I3,
	Clatin	= 128,
	Cbinary,
	Cnull,
	Ceascii,
	Cutf,
};
struct
{
	char*	word;
	int	class;
} dict[] =
{
	"PATH",		Lword,
	"TEXT",		Aword,
	"adt",		Alword,
	"aggr",		Alword,
	"alef",		Alword,
	"array",	Lword,
	"block",	Fword,
	"char",		Cword,
	"common",	Fword,
	"con",		Lword,
	"data",		Fword,
	"dimension",	Fword,
	"double",	Cword,
	"extern",	Cword,
	"bio",		I2,
	"float",	Cword,
	"fn",		Lword,
	"function",	Fword,
	"h",		I3,
	"implement",	Lword,
	"import",	Lword,
	"include",	I1,
	"int",		Cword,
	"integer",	Fword,
	"iota",		Lword,
	"libc",		I2,
	"long",		Cword,
	"module",	Lword,
	"real",		Fword,
	"ref",		Lword,
	"register",	Cword,
	"self",		Lword,
	"short",	Cword,
	"static",	Cword,
	"stdio",	I2,
	"struct",	Cword,
	"subroutine",	Fword,
	"u",		I2,
	"void",		Cword,
};

/* codes for 'mode' field in language structure */
enum	{
		Normal	= 0,
		First,		/* first entry for language spanning several ranges */
		Multi,		/* later entries "   "       "  ... */
		Shared,		/* codes used in several languages */
	};

struct
{
	int	mode;		/* see enum above */
	int 	count;
	int	low;
	int	high;
	char	*name;

} language[] =
{
	Normal,	0,	0x0100,	0x01FF,	"Extended Latin",
	Normal,	0,	0x0370,	0x03FF,	"Greek",
	Normal,	0,	0x0400,	0x04FF,	"Cyrillic",
	Normal,	0,	0x0530,	0x058F,	"Armenian",
	Normal,	0,	0x0590,	0x05FF,	"Hebrew",
	Normal,	0,	0x0600,	0x06FF,	"Arabic",
	Normal,	0,	0x0900,	0x097F,	"Devanagari",
	Normal,	0,	0x0980,	0x09FF,	"Bengali",
	Normal,	0,	0x0A00,	0x0A7F,	"Gurmukhi",
	Normal,	0,	0x0A80,	0x0AFF,	"Gujarati",
	Normal,	0,	0x0B00,	0x0B7F,	"Oriya",
	Normal,	0,	0x0B80,	0x0BFF,	"Tamil",
	Normal,	0,	0x0C00,	0x0C7F,	"Telugu",
	Normal,	0,	0x0C80,	0x0CFF,	"Kannada",
	Normal,	0,	0x0D00,	0x0D7F,	"Malayalam",
	Normal,	0,	0x0E00,	0x0E7F,	"Thai",
	Normal,	0,	0x0E80,	0x0EFF,	"Lao",
	Normal,	0,	0x1000,	0x105F,	"Tibetan",
	Normal,	0,	0x10A0,	0x10FF,	"Georgian",
	Normal,	0,	0x3040,	0x30FF,	"Japanese",
	Normal,	0,	0x3100,	0x312F,	"Chinese",
	First,	0,	0x3130,	0x318F,	"Korean",
	Multi,	0,	0x3400,	0x3D2F,	"Korean",
	Shared,	0,	0x4e00,	0x9fff,	"CJK",
	Normal,	0,	0,	0,	0,		/* terminal entry */
};


enum
{
	Fascii,		/* printable ascii */
	Flatin,		/* latin 1*/
	Futf,		/* UTF character set */
	Fbinary,	/* binary */
	Feascii,	/* ASCII with control chars */
	Fnull,		/* NULL in file */
} guess;

void	bump_utf_count(Rune);
int	cistrncmp(char*, char*, int);
void	filetype(int);
int	getfontnum(uchar*, uchar**);
int	isas(void);
int	isc(void);
int	iscint(void);
int	isenglish(void);
int	ishp(void);
int	ishtml(void);
int	isrfc822(void);
int	ismbox(void);
int	islimbo(void);
int	istga(void);
int	ismp3(void);
int	ismung(void);
int	isp9bit(void);
int	isp9font(void);
int	isrtf(void);
int	ismsdos(void);
int	isicocur(void);
int	iself(void);
int	istring(void);
int	isoffstr(void);
int	iff(void);
int	long0(void);
int	longoff(void);
int	istar(void);
int	isface(void);
int	isexec(void);
int	p9bitnum(char*, int*);
int	p9subfont(uchar*);
void	print_utf(void);
void	type(char*, int);
int	utf_count(void);
void	wordfreq(void);

int	(*call[])(void) =
{
	long0,		/* recognizable by first 4 bytes */
	istring,	/* recognizable by first string */
	iself,		/* ELF (foreign) executable */
	isexec,		/* native executables */
	iff,		/* interchange file format (strings) */
	longoff,	/* recognizable by 4 bytes at some offset */
	isoffstr,	/* recognizable by string at some offset */
	isrfc822,	/* email file */
	ismbox,		/* mail box */
	istar,		/* recognizable by tar checksum */
	iscint,		/* compiler/assembler intermediate */
	ishtml,		/* html keywords */
	islimbo,	/* limbo source */
	isc,		/* c & alef compiler key words */
	isas,		/* assembler key words */
	isp9font,	/* plan 9 font */
	isp9bit,	/* plan 9 image (as from /dev/window) */
	isrtf,		/* rich text format */
	ismsdos,	/* msdos exe (virus file attachement) */
	isicocur,		/* windows icon or cursor file */
	isface,		/* ascii face file */
	istga,
	ismp3,

	/* last resorts */
	ismung,		/* entropy compressed/encrypted */
	isenglish,	/* char frequency English */
	0
};

int mime;

char OCTET[] =	"application/octet-stream";
char PLAIN[] =	"text/plain";

void
main(int argc, char *argv[])
{
	int i, j, maxlen;
	char *cp;
	Rune r;

	ARGBEGIN{
	case 'm':
		mime = 1;
		break;
	default:
		fprint(2, "usage: file [-m] [file...]\n");
		exits("usage");
	}ARGEND;

	maxlen = 0;
	if(mime == 0 || argc > 1){
		for(i = 0; i < argc; i++) {
			for (j = 0, cp = argv[i]; *cp; j++, cp += chartorune(&r, cp))
					;
			if(j > maxlen)
				maxlen = j;
		}
	}
	if (argc <= 0) {
		if(!mime)
			print ("stdin: ");
		filetype(0);
	}
	else {
		for(i = 0; i < argc; i++)
			type(argv[i], maxlen);
	}
	exits(0);
}

void
type(char *file, int nlen)
{
	Rune r;
	int i;
	char *p;

	if(nlen > 0){
		slash = 0;
		for (i = 0, p = file; *p; i++) {
			if (*p == '/')			/* find rightmost slash */
				slash = p;
			p += chartorune(&r, p);		/* count runes */
		}
		print("%s:%*s",file, nlen-i+1, "");
	}
	fname = file;
	if ((fd = open(file, OREAD)) < 0) {
		fprint(2, "cannot open: %r\n");
		return;
	}
	filetype(fd);
	close(fd);
}

void
utfconv(void)
{
	Rune r;
	uchar *rb;
	char *p, *e;
	int i;

	if(nbuf < 4)
		return;

	if(memcmp(buf, "\x00\x00\xFE\xFF", 4) == 0){
		if(!mime)
			print("utf-32be ");
		return;
	} else
	if(memcmp(buf, "\xFE\xFF\x00\x00", 4) == 0){
		if(!mime)
			print("utf-32le ");
		return;
	} else
	if(memcmp(buf, "\xEF\xBB\xBF", 3) == 0){
		memmove(buf, buf+3, nbuf-3);
		nbuf -= 3;
		return;
	} else
	if(memcmp(buf, "\xFE\xFF", 2) == 0){
		if(!mime)
			print("utf-16be ");

		nbuf -= 2;
		rb = malloc(nbuf+1);
		memmove(rb, buf+2, nbuf);
		p = (char*)buf;
		e = p+sizeof(buf)-UTFmax-1;
		for(i=0; i<nbuf && p < e; i+=2){
			r = rb[i+1] | rb[i]<<8;
			p += runetochar(p, &r);
		}
		*p = 0;
		free(rb);
		nbuf = p - (char*)buf;
	} else
	if(memcmp(buf, "\xFF\xFE", 2) == 0){
		if(!mime)
			print("utf-16le ");

		nbuf -= 2;
		rb = malloc(nbuf+1);
		memmove(rb, buf+2, nbuf);
		p = (char*)buf;
		e = p+sizeof(buf)-UTFmax-1;
		for(i=0; i<nbuf && p < e; i+=2){
			r = rb[i] | rb[i+1]<<8;
			p += runetochar(p, &r);
		}
		*p = 0;
		free(rb);
		nbuf = p - (char*)buf;
	}
}

void
filetype(int fd)
{
	Rune r;
	int i, f, n;
	char *p, *eob;

	free(mbuf);
	mbuf = dirfstat(fd);
	if(mbuf == nil){
		fprint(2, "cannot stat: %r\n");
		return;
	}
	if(mbuf->mode & DMDIR) {
		print("%s\n", mime ? OCTET : "directory");
		return;
	}
	if(mbuf->type != 'M' && mbuf->type != '|') {
		if(mime)
			print("%s\n", OCTET);
		else
			print("special file #%C/%s\n", mbuf->type, mbuf->name);
		return;
	}
	/* may be reading a pipe on standard input */
	nbuf = readn(fd, buf, sizeof(buf)-1);
	if(nbuf < 0) {
		fprint(2, "cannot read: %r\n");
		return;
	}
	if(nbuf == 0) {
		print("%s\n", mime ? PLAIN : "empty file");
		return;
	}
	buf[nbuf] = 0;

	utfconv();

	/*
	 * build histogram table
	 */
	memset(cfreq, 0, sizeof(cfreq));
	for (i = 0; language[i].name; i++)
		language[i].count = 0;
	eob = (char *)buf+nbuf;
	for(n = 0, p = (char *)buf; p < eob; n++) {
		if (!fullrune(p, eob-p) && eob-p < UTFmax)
			break;
		p += chartorune(&r, p);
		if (r == 0)
			f = Cnull;
		else if (r <= 0x7f) {
			if (!isprint(r) && !isspace(r))
				f = Ceascii;	/* ASCII control char */
			else f = r;
		} else if (r == 0x80) {
			bump_utf_count(r);
			f = Cutf;
		} else if (r < 0xA0)
			f = Cbinary;	/* Invalid Runes */
		else if (r <= 0xff)
			f = Clatin;	/* Latin 1 */
		else {
			bump_utf_count(r);
			f = Cutf;		/* UTF extension */
		}
		cfreq[f]++;			/* ASCII chars peg directly */
	}
	/*
	 * gross classify
	 */
	if (cfreq[Cbinary])
		guess = Fbinary;
	else if (cfreq[Cutf])
		guess = Futf;
	else if (cfreq[Clatin])
		guess = Flatin;
	else if (cfreq[Ceascii])
		guess = Feascii;
	else if (cfreq[Cnull])
		guess = Fbinary;
	else
		guess = Fascii;
	/*
	 * lookup dictionary words
	 */
	memset(wfreq, 0, sizeof(wfreq));
	if(guess == Fascii || guess == Flatin || guess == Futf)
		wordfreq();
	/*
	 * call individual classify routines
	 */
	for(i=0; call[i]; i++)
		if((*call[i])())
			return;

	/*
	 * if all else fails,
	 * print out gross classification
	 */
	if (nbuf < 100 && !mime)
		print(mime ? PLAIN : "short ");
	if (guess == Fascii)
		print("%s\n", mime ? PLAIN : "Ascii");
	else if (guess == Feascii)
		print("%s\n", mime ? PLAIN : "extended ascii");
	else if (guess == Flatin)
		print("%s\n", mime ? PLAIN : "latin ascii");
	else if (guess == Futf && utf_count() < 4)
		print_utf();
	else print("%s\n", mime ? OCTET : "binary");
}

void
bump_utf_count(Rune r)
{
	int low, high, mid;

	high = sizeof(language)/sizeof(language[0])-1;
	for (low = 0; low < high;) {
		mid = (low+high)/2;
		if (r >= language[mid].low) {
			if (r <= language[mid].high) {
				language[mid].count++;
				break;
			} else low = mid+1;
		} else high = mid;
	}
}

int
utf_count(void)
{
	int i, count;

	count = 0;
	for (i = 0; language[i].name; i++)
		if (language[i].count > 0)
			switch (language[i].mode) {
			case Normal:
			case First:
				count++;
				break;
			default:
				break;
			}
	return count;
}

int
chkascii(void)
{
	int i;

	for (i = 'a'; i < 'z'; i++)
		if (cfreq[i])
			return 1;
	for (i = 'A'; i < 'Z'; i++)
		if (cfreq[i])
			return 1;
	return 0;
}

int
find_first(char *name)
{
	int i;

	for (i = 0; language[i].name != 0; i++)
		if (language[i].mode == First
			&& strcmp(language[i].name, name) == 0)
			return i;
	return -1;
}

void
print_utf(void)
{
	int i, printed, j;

	if(mime){
		print("%s\n", PLAIN);
		return;
	}
	if (chkascii()) {
		printed = 1;
		print("Ascii");
	} else
		printed = 0;
	for (i = 0; language[i].name; i++)
		if (language[i].count) {
			switch(language[i].mode) {
			case Multi:
				j = find_first(language[i].name);
				if (j < 0)
					break;
				if (language[j].count > 0)
					break;
				/* Fall through */
			case Normal:
			case First:
				if (printed)
					print(" & ");
				else printed = 1;
				print("%s", language[i].name);
				break;
			case Shared:
			default:
				break;
			}
		}
	if(!printed)
		print("UTF");
	print(" text\n");
}

void
wordfreq(void)
{
	int low, high, mid, r;
	uchar *p, *p2, c;

	p = buf;
	for(;;) {
		while (p < buf+nbuf && !isalpha(*p))
			p++;
		if (p >= buf+nbuf)
			return;
		p2 = p;
		while(p < buf+nbuf && isalpha(*p))
			p++;
		c = *p;
		*p = 0;
		high = sizeof(dict)/sizeof(dict[0]);
		for(low = 0;low < high;) {
			mid = (low+high)/2;
			r = strcmp(dict[mid].word, (char*)p2);
			if(r == 0) {
				wfreq[dict[mid].class]++;
				break;
			}
			if(r < 0)
				low = mid+1;
			else
				high = mid;
		}
		*p++ = c;
	}
}

typedef struct Filemagic Filemagic;
struct Filemagic {
	ulong x;
	ulong mask;
	char *desc;
	char *mime;
};

/*
 * integers in this table must be as seen on a little-endian machine
 * when read from a file.
 */
Filemagic long0tab[] = {
	0xF16DF16D,	0xFFFFFFFF,	"pac1 audio file",	OCTET,
	/* "pac1" */
	0x31636170,	0xFFFFFFFF,	"pac3 audio file",	OCTET,
	/* "pXc2 */
	0x32630070,	0xFFFF00FF,	"pac4 audio file",	OCTET,
	0xBA010000,	0xFFFFFFFF,	"mpeg system stream",	OCTET,
	0x43614c66,	0xFFFFFFFF,	"FLAC audio file",	"audio/flac",
	0x30800CC0,	0xFFFFFFFF,	"inferno .dis executable", OCTET,
	0x04034B50,	0xFFFFFFFF,	"zip archive", "application/zip",
	070707,		0xFFFF,		"cpio archive", "application/x-cpio",
	0x2F7,		0xFFFF,		"tex dvi", "application/dvi",
	0xfaff,		0xfeff,		"mp3 audio",	"audio/mpeg",
	0xf0ff,		0xf6ff,		"aac audio",	"audio/mpeg",
	/* 0xfeedface: this could alternately be a Next Plan 9 boot image */
	0xcefaedfe,	0xFFFFFFFF,	"32-bit power Mach-O executable", OCTET,
	/* 0xfeedfacf */
	0xcffaedfe,	0xFFFFFFFF,	"64-bit power Mach-O executable", OCTET,
	/* 0xcefaedfe */
	0xfeedface,	0xFFFFFFFF,	"386 Mach-O executable", OCTET,
	/* 0xcffaedfe */
	0xfeedfacf,	0xFFFFFFFF,	"amd64 Mach-O executable", OCTET,
	/* 0xcafebabe */
	0xbebafeca,	0xFFFFFFFF,	"Mach-O universal executable", OCTET,
	/*
	 * venti & fossil magic numbers are stored big-endian on disk,
	 * thus the numbers appear reversed in this table.
	 */
	0xad4e5cd1,	0xFFFFFFFF,	"venti arena", OCTET,
	0x2bb19a52,	0xFFFFFFFF,	"paq archive", OCTET,
	0x1a53454e,	0xFFFFFFFF,	"NES ROM", OCTET,
	/* tcpdump pcap file */
	0xa1b2c3d4,	0xFFFFFFFF,	"pcap file",	"application/vnd.tcpdump.pcap",
	0xd4c3b2a1,	0xFFFFFFFF,	"pcap file",	"application/vnd.tcpdump.pcap",
	0xa1b23c4d,	0xFFFFFFFF,	"pcap file",	"application/vnd.tcpdump.pcap",
	0x4d3cb2a1,	0xFFFFFFFF,	"pcap file",	"application/vnd.tcpdump.pcap",
};

int
filemagic(Filemagic *tab, int ntab, ulong x)
{
	int i;

	for(i=0; i<ntab; i++)
		if((x&tab[i].mask) == tab[i].x){
			print("%s\n", mime ? tab[i].mime : tab[i].desc);
			return 1;
		}
	return 0;
}

int
long0(void)
{
	return filemagic(long0tab, nelem(long0tab), LENDIAN(buf));
}

typedef struct Fileoffmag Fileoffmag;
struct Fileoffmag {
	ulong	off;
	Filemagic;
};

/*
 * integers in this table must be as seen on a little-endian machine
 * when read from a file.
 */
Fileoffmag longofftab[] = {
	/*
	 * venti & fossil magic numbers are stored big-endian on disk,
	 * thus the numbers appear reversed in this table.
	 */
	256*1024, 0xe7a5e4a9, 0xFFFFFFFF, "venti arenas partition", OCTET,
	256*1024, 0xc75e5cd1, 0xFFFFFFFF, "venti index section", OCTET,
	128*1024, 0x89ae7637, 0xFFFFFFFF, "fossil write buffer", OCTET,
	4,	  0x31647542, 0xFFFFFFFF, "OS X finder properties", OCTET,
	0x100,	  0x41474553, 0xFFFFFFFF, "SEGA ROM", OCTET,
	0x1fc,	  0xAA550000, 0xFFFF0000, "bootable disk image", OCTET,
};

int
fileoffmagic(Fileoffmag *tab, int ntab)
{
	int i;
	ulong x;
	Fileoffmag *tp;
	uchar buf[sizeof(long)];

	for(i=0; i<ntab; i++) {
		tp = tab + i;
		seek(fd, tp->off, 0);
		if (readn(fd, buf, sizeof buf) != sizeof buf)
			continue;
		x = LENDIAN(buf);
		if((x&tp->mask) == tp->x){
			print("%s\n", mime ? tp->mime : tp->desc);
			return 1;
		}
	}
	return 0;
}

int
longoff(void)
{
	return fileoffmagic(longofftab, nelem(longofftab));
}

int
isexec(void)
{
	Fhdr f;

	seek(fd, 0, 0);		/* reposition to start of file */
	if(crackhdr(fd, &f)) {
		print("%s\n", mime ? OCTET : f.name);
		return 1;
	}
	return 0;
}


/* from tar.c */
enum { NAMSIZ = 100, TBLOCK = 512 };

union	hblock
{
	char	dummy[TBLOCK];
	struct	header
	{
		char	name[NAMSIZ];
		char	mode[8];
		char	uid[8];
		char	gid[8];
		char	size[12];
		char	mtime[12];
		char	chksum[8];
		char	linkflag;
		char	linkname[NAMSIZ];
		/* rest are defined by POSIX's ustar format; see p1003.2b */
		char	magic[6];	/* "ustar" */
		char	version[2];
		char	uname[32];
		char	gname[32];
		char	devmajor[8];
		char	devminor[8];
		char	prefix[155];  /* if non-null, path = prefix "/" name */
	} dbuf;
};

int
checksum(union hblock *hp)
{
	int i;
	char *cp;
	struct header *hdr = &hp->dbuf;

	for (cp = hdr->chksum; cp < &hdr->chksum[sizeof hdr->chksum]; cp++)
		*cp = ' ';
	i = 0;
	for (cp = hp->dummy; cp < &hp->dummy[TBLOCK]; cp++)
		i += *cp & 0xff;
	return i;
}

int
istar(void)
{
	int chksum;
	char tblock[TBLOCK];
	union hblock *hp = (union hblock *)tblock;
	struct header *hdr = &hp->dbuf;

	seek(fd, 0, 0);		/* reposition to start of file */
	if (readn(fd, tblock, sizeof tblock) != sizeof tblock)
		return 0;
	chksum = strtol(hdr->chksum, 0, 8);
	if (hdr->name[0] != '\0' && checksum(hp) == chksum) {
		if (strcmp(hdr->magic, "ustar") == 0)
			print(mime? "application/x-ustar\n": "posix tar archive\n");
		else
			print(mime? "application/x-tar\n": "tar archive\n");
		return 1;
	}
	return 0;
}

/*
 * initial words to classify file
 */
struct	FILE_STRING
{
	char 	*key;
	char	*filetype;
	int	length;
	char	*mime;
} file_string[] =
{
	"\x1f\x9d",		"compressed",			2,	"application/x-compress",
	"\x1f\x8b",		"gzip compressed",		2,	"application/x-gzip",
	"BZh",			"bzip2 compressed",		3,	"application/x-bzip2",
	"!<arch>\n__.SYMDEF",	"archive random library",	16,	OCTET,
	"!<arch>\n",		"archive",			8,	OCTET,
	"070707",		"cpio archive - ascii header",	6,	OCTET,
	"#!/bin/rc",		"rc executable file",		9,	PLAIN,
	"#!/bin/sh",		"sh executable file",		9,	PLAIN,
	"%!",			"postscript",			2,	"application/postscript",
	"\004%!",		"postscript",			3,	"application/postscript",
	"x T post",		"troff output for post",	8,	"application/troff",
	"x T Latin1",		"troff output for Latin1",	10,	"application/troff",
	"x T utf",		"troff output for UTF",		7,	"application/troff",
	"x T 202",		"troff output for 202",		7,	"application/troff",
	"x T aps",		"troff output for aps",		7,	"application/troff",
	"x T ",			"troff output",			4,	"application/troff",
	"GIF",			"GIF image", 			3,	"image/gif",
	"\0PC Research, Inc\0",	"ghostscript fax file",		18,	"application/ghostscript",
	"%PDF",			"PDF",				4,	"application/pdf",
	"<!DOCTYPE",		"HTML file",			9,	"text/html",
	"<!doctype",		"HTML file",			9,	"text/html",
	"<!--",			"HTML file",			4,	"text/html",
	"<html>",		"HTML file",			6,	"text/html",
	"<HTML>",		"HTML file",			6,	"text/html",
	"<?xml",		"HTML file",			5,	"text/html",
	"\111\111\052\000",	"tiff",				4,	"image/tiff",
	"\115\115\000\052",	"tiff",				4,	"image/tiff",
	"\377\330\377\340",	"jpeg",				4,	"image/jpeg",
	"\377\330\377\341",	"jpeg",				4,	"image/jpeg",
	"\377\330\377\333",	"jpeg",				4,	"image/jpeg",
	"\xff\xd8",		"jpeg",				2,	"image/jpeg",
	"BM",			"bmp",				2,	"image/bmp", 
	"\xD0\xCF\x11\xE0\xA1\xB1\x1A\xE1",	"microsoft office document",	8,	"application/doc",
	"<MakerFile ",		"FrameMaker file",		11,	"application/framemaker",
	"\033E\033",		"HP PCL printer data",		3,	OCTET,
	"\033&",		"HP PCL printer data",		2,	OCTET,
	"\033%-12345X",		"HPJCL file",		9,	"application/hpjcl",
	"\033Lua",		"Lua bytecode",		4,	OCTET,
	"ID3",			"mp3 audio with id3",	3,	"audio/mpeg",
	"OggS",			"ogg audio",		4,	"audio/ogg",
	".snd",			"sun audio",		4,	"audio/basic",
	"\211PNG",		"PNG image",		4,	"image/png",
	"P1\n",			"ppm",			3,	"image/ppm",
	"P2\n",			"ppm",			3,	"image/ppm",
	"P3\n",			"ppm",			3,	"image/ppm",
	"P4\n",			"ppm",			3,	"image/ppm",
	"P5\n",			"ppm",			3,	"image/ppm",
	"P6\n",			"ppm",			3,	"image/ppm",
	"/* XPM */\n",	"xbm",				10,	"image/xbm",
	".HTML ",		"troff -ms input",	6,	"text/troff",
	".LP",			"troff -ms input",	3,	"text/troff",
	".ND",			"troff -ms input",	3,	"text/troff",
	".PP",			"troff -ms input",	3,	"text/troff",
	".TL",			"troff -ms input",	3,	"text/troff",
	".TR",			"troff -ms input",	3,	"text/troff",
	".TH",			"manual page",		3,	"text/troff",
	".\\\"",		"troff input",		3,	"text/troff",
	".de",			"troff input",		3,	"text/troff",
	".if",			"troff input",		3,	"text/troff",
	".nr",			"troff input",		3,	"text/troff",
	".tr",			"troff input",		3,	"text/troff",
	"vac:",			"venti score",		4,	PLAIN,
	"-----BEGIN CERTIFICATE-----\n",
				"pem certificate",	-1,	PLAIN,
	"-----BEGIN TRUSTED CERTIFICATE-----\n",
				"pem trusted certificate", -1,	PLAIN,
	"-----BEGIN X509 CERTIFICATE-----\n",
				"pem x.509 certificate", -1,	PLAIN,
	"subject=/C=",		"pem certificate with header", -1, PLAIN,
	"process snapshot ",	"process snapshot",	-1,	"application/snapfs",
	"d8:announce",		"torrent file",		11,	"application/x-bittorrent",
	"[playlist]",		"playlist",		10,	"application/x-scpls",
	"#EXTM3U",		"playlist",		7,	"audio/x-mpegurl",
	"BEGIN:VCARD\r\n",	"vCard",		13,	"text/directory;profile=vcard",
	"BEGIN:VCARD\n",	"vCard",		12,	"text/directory;profile=vcard",
	"AT&T",			"DjVu document",	4,	"image/vnd.djvu",
	"Extended module: ",	"XM audio",		17,	"audio/xm",
	"MThd",			"midi audio",		4,	"audio/midi",
	"MUS\x1a",		"mus audio",		4,	"audio/mus",
	"\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff"
	"\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff"
	"\x00\x00\x00\xbb\x11\x22\x00\x44\xff\xff\xff\xff\xff\xff\xff\xff"
	"\xaa\x99\x55\x66", "Xilinx bitstream (not byteswappped)", 52, OCTET,
	"\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff"
	"\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff"
	"\xbb\x00\x00\x00\x44\x00\x22\x11\xff\xff\xff\xff\xff\xff\xff\xff"
	"\x66\x55\x99\xaa", "Xilinx bitstream (byteswappped)", 52, OCTET,
	0,0,0,0
};

int
istring(void)
{
	int i, l;
	struct FILE_STRING *p;

	for(p = file_string; p->key; p++) {
		l = p->length;
		if(l == -1)
			l = strlen(p->key);
		if(nbuf >= l && memcmp(buf, p->key, l) == 0) {
			print("%s\n", mime ? p->mime : p->filetype);
			return 1;
		}
	}
	if(strncmp((char*)buf, "TYPE=", 5) == 0) {	/* td */
		for(i = 5; i < nbuf; i++)
			if(buf[i] == '\n')
				break;
		if(mime)
			print("%s\n", OCTET);
		else
			print("%.*s picture\n", utfnlen((char*)buf+5, i-5), (char*)buf+5);
		return 1;
	}
	return 0;
}

struct offstr
{
	ulong	off;
	struct FILE_STRING;
} offstrs[] = {
	32*1024, "\001CD001\001",	"ISO9660 CD image",	7,	"application/x-iso9660-image",
	32*4, "DICM",	"DICOM medical imaging data",	4,	"application/dicom",
	0, 0, 0, 0, 0
};

int
isoffstr(void)
{
	int n;
	char buf[256];
	struct offstr *p;

	for(p = offstrs; p->key; p++) {
		seek(fd, p->off, 0);
		n = p->length;
		if (n > sizeof buf)
			n = sizeof buf;
		if (readn(fd, buf, n) != n)
			continue;
		if(memcmp(buf, p->key, n) == 0) {
			print("%s\n", mime ? p->mime : p->filetype);
			return 1;
		}
	}
	return 0;
}

int
iff(void)
{
	if (strncmp((char*)buf, "FORM", 4) == 0 &&
	    strncmp((char*)buf+8, "AIFF", 4) == 0) {
		print("%s\n", mime? "audio/x-aiff": "aiff audio");
		return 1;
	}
	if (strncmp((char*)buf, "RIFF", 4) == 0) {
		if (strncmp((char*)buf+8, "WAVE", 4) == 0)
			print("%s\n", mime? "audio/wave": "wave audio");
		else if (strncmp((char*)buf+8, "AVI ", 4) == 0)
			print("%s\n", mime? "video/avi": "avi video");
		else
			print("%s\n", mime? OCTET : "riff file");
		return 1;
	}
	return 0;
}

char*	html_string[] = {
	"blockquote",
	"!DOCTYPE", "![CDATA[", "basefont", "frameset", "noframes", "textarea",
	"caption",
	"button", "center", "iframe", "object", "option", "script",
	"select", "strong",
	"blink", "embed", "frame", "input", "label", "param", "small",
	"style", "table", "tbody", "tfoot", "thead", "title",
	"?xml", "body", "code", "font", "form", "head", "html",
	"link", "menu", "meta", "span",
	"!--", "big", "dir", "div", "img", "pre", "sub", "sup",
	"br", "dd", "dl", "dt", "em", "h1", "h2", "h3", "h4", "h5",
	"h6", "hr", "li", "ol", "td", "th", "tr", "tt", "ul",
	"a", "b", "i", "p", "q", "u",
	0,
};

int
ishtml(void)
{
	int i, n, count;
	uchar *p;

	count = 0;
	p = buf;
	for(;;) {
		while(p < buf+nbuf && *p != '<')
			p++;
		p++;
		if (p >= buf+nbuf)
			break;
		if(*p == '/')
			p++;
		if(p >= buf+nbuf)
			break;
		for(i = 0; html_string[i]; i++){
			n = strlen(html_string[i]);
			if(p + n > buf+nbuf)
				continue;
			if(cistrncmp(html_string[i], (char*)p, n) == 0) {
				p += n;
				if(p < buf+nbuf && strchr("\t\r\n />", *p)){
					if(++count > 2) {
						print("%s\n", mime ? "text/html" : "HTML file");
						return 1;
					}
				}
				break;
			}
		}
	}
	return 0;
}

char*	rfc822_string[] =
{
	"from:",
	"date:",
	"to:",
	"subject:",
	"received:",
	"reply to:",
	"sender:",
	0,
};

int
isrfc822(void)
{

	char *p, *q, *r;
	int i, count;

	count = 0;
	p = (char*)buf;
	for(;;) {
		q = strchr(p, '\n');
		if(q == nil)
			break;
		*q = 0;
		if(p == (char*)buf && strncmp(p, "From ", 5) == 0 && strstr(p, " remote from ")){
			count++;
			*q = '\n';
			p = q+1;
			continue;
		}
		*q = '\n';
		if(*p != '\t' && *p != ' '){
			r = strchr(p, ':');
			if(r == 0 || r > q)
				break;
			for(i = 0; rfc822_string[i]; i++) {
				if(cistrncmp(p, rfc822_string[i], strlen(rfc822_string[i])) == 0){
					count++;
					break;
				}
			}
		}
		p = q+1;
	}
	if(count >= 3){
		print("%s\n", mime ? "message/rfc822" : "email file");
		return 1;
	}
	return 0;
}

int
ismbox(void)
{
	char *p, *q;

	p = (char*)buf;
	q = strchr(p, '\n');
	if(q == nil)
		return 0;
	*q = 0;
	if(strncmp(p, "From ", 5) == 0 && strstr(p, " remote from ") == nil){
		print("%s\n", mime ? PLAIN : "mail box");
		return 1;
	}
	*q = '\n';
	return 0;
}

int
iscint(void)
{
	int type;
	char *name;
	Biobuf b;

	if(Binit(&b, fd, OREAD) == Beof)
		return 0;
	seek(fd, 0, 0);
	type = objtype(&b, &name);
	if(type < 0)
		return 0;
	if(mime)
		print("%s\n", OCTET);
	else
		print("%s intermediate\n", name);
	return 1;
}

int
isc(void)
{
	int n;

	n = wfreq[I1];
	/*
	 * includes
	 */
	if(n >= 2 && wfreq[I2] >= n && wfreq[I3] >= n && cfreq['.'] >= n)
		goto yes;
	if(n >= 1 && wfreq[Alword] >= n && wfreq[I3] >= n && cfreq['.'] >= n)
		goto yes;
	/*
	 * declarations
	 */
	if(wfreq[Cword] >= 5 && cfreq[';'] >= 5)
		goto yes;
	/*
	 * assignments
	 */
	if(cfreq[';'] >= 10 && cfreq['='] >= 10 && wfreq[Cword] >= 1)
		goto yes;
	return 0;

yes:
	if(mime){
		print("%s\n", PLAIN);
		return 1;
	}
	if(wfreq[Alword] > 0)
		print("alef program\n");
	else
		print("c program\n");
	return 1;
}

int
islimbo(void)
{
	/*
	 * includes
	 */
	if(wfreq[Lword] < 4)
		return 0;
	print("%s\n", mime ? PLAIN : "limbo program");
	return 1;
}

int
isas(void)
{
	/*
	 * includes
	 */
	if(wfreq[Aword] < 2)
		return 0;
	print("%s\n", mime ? PLAIN : "as program");
	return 1;
}

int
istga(void)
{
	uchar *p;

	p = buf;
	if(nbuf < 18)
		return 0;
	if((p[12] | p[13]<<8) == 0)	/* width */
		return 0;
	if((p[14] | p[15]<<8) == 0)	/* height */
		return 0;
	if(p[16] != 8 && p[16] != 15 && p[16] != 16 && p[16] != 24 && p[16] != 32)	/* bpp */
		return 0;
	if(((p[2]|(1<<3)) & (~3)) != (1<<3))	/* rle flag */
		return 0;
	if(p[1] == 0){	/* non color-mapped */
		if((p[2]&3) != 2 && (p[2]&3) != 3)	
			return 0;
		if((p[5] | p[6]<<8) != 0)	/* palette length */
			return 0;
	} else
	if(p[1] == 1){	/* color-mapped */
		if((p[2]&3) != 1 || p[7] == 0)	
			return 0;
		if((p[5] | p[6]<<8) == 0)	/* palette length */
			return 0;
	} else
		return 0;
	print("%s\n", mime ? "image/tga" : "targa image");
	return 1;
}

int
ismp3(void)
{
	uchar *p, *e;

	p = buf;
	e = p + nbuf-1;
	while((p < e) && (p = memchr(p, 0xFF, e - p))){
		if((p[1] & 0xFE) == 0xFA){
			print("%s\n", mime ? "audio/mpeg" : "mp3 audio");
			return 1;
		}
		p++;
	}
	return 0;
}

/*
 * low entropy means encrypted
 */
int
ismung(void)
{
	int i, bucket[8];
	float cs;

	if(nbuf < 64)
		return 0;
	memset(bucket, 0, sizeof(bucket));
	for(i=nbuf-64; i<nbuf; i++)
		bucket[(buf[i]>>5)&07] += 1;

	cs = 0.;
	for(i=0; i<8; i++)
		cs += (bucket[i]-8)*(bucket[i]-8);
	cs /= 8.;
	if(cs <= 24.322) {
		if(buf[0]==0x1f && buf[1]==0x9d)
			print("%s\n", mime ? "application/x-compress" : "compressed");
		else
		if(buf[0]==0x1f && buf[1]==0x8b)
			print("%s\n", mime ? "application/x-gzip" : "gzip compressed");
		else
		if(buf[0]=='B' && buf[1]=='Z' && buf[2]=='h')
			print("%s\n", mime ? "application/x-bzip2" : "bzip2 compressed");
		else
		if(buf[0]==0x78 && buf[1]==0x9c)
			print("%s\n", mime ? "application/x-deflate" : "zlib compressed");
		else
			print("%s\n", mime ? OCTET : "encrypted");
		return 1;
	}
	return 0;
}

/*
 * english by punctuation and frequencies
 */
int
isenglish(void)
{
	int vow, comm, rare, badpun, punct;
	char *p;

	if(guess != Fascii && guess != Feascii)
		return 0;
	badpun = 0;
	punct = 0;
	for(p = (char *)buf; p < (char *)buf+nbuf-1; p++)
		switch(*p) {
		case '.':
		case ',':
		case ')':
		case '%':
		case ';':
		case ':':
		case '?':
			punct++;
			if(p[1] != ' ' && p[1] != '\n')
				badpun++;
		}
	if(badpun*5 > punct)
		return 0;
	if(cfreq['>']+cfreq['<']+cfreq['/'] > cfreq['e'])	/* shell file test */
		return 0;
	if(2*cfreq[';'] > cfreq['e'])
		return 0;

	vow = 0;
	for(p="AEIOU"; *p; p++) {
		vow += cfreq[*p];
		vow += cfreq[tolower(*p)];
	}
	comm = 0;
	for(p="ETAION"; *p; p++) {
		comm += cfreq[*p];
		comm += cfreq[tolower(*p)];
	}
	rare = 0;
	for(p="VJKQXZ"; *p; p++) {
		rare += cfreq[*p];
		rare += cfreq[tolower(*p)];
	}
	if(vow*5 >= nbuf-cfreq[' '] && comm >= 10*rare) {
		print("%s\n", mime ? PLAIN : "English text");
		return 1;
	}
	return 0;
}

/*
 * pick up a number with
 * syntax _*[0-9]+_
 */
#define	P9BITLEN	12
int
p9bitnum(char *s, int *v)
{
	char *es;

	if(s[P9BITLEN-1] != ' ')
		return -1;
	s[P9BITLEN-1] = '\0';
	*v = strtol(s, &es, 10);
	s[P9BITLEN-1] = ' ';
	if(es != &s[P9BITLEN-1])
		return -1;
	return 0;
}

int
depthof(char *s, int *newp)
{
	char *es;
	int d;

	*newp = 0;
	es = s+12;
	while(s<es && *s==' ')
		s++;
	if(s == es)
		return -1;
	if('0'<=*s && *s<='9')
		return 1<<strtol(s, nil, 0);

	*newp = 1;
	d = 0;
	while(s<es && *s!=' '){
		if(strchr("rgbkamx", *s) == nil)
			return -1;
		s++;
		if('0'<=*s && *s<='9')
			d += strtoul(s, &s, 10);
		else
			return -1;
	}

	if(d % 8 == 0 || 8 % d == 0)
		return d;
	else
		return -1;
}

int
isp9bit(void)
{
	int dep, lox, loy, hix, hiy, px, new, cmpr;
	long len;
	char *newlabel;
	uchar *cp;

	cp = buf;
	cmpr = 0;
	if(memcmp(cp, "compressed\n", 11) == 0) {
		cmpr = 1;
		cp = buf + 11;
	}

	if((dep = depthof((char*)cp + 0*P9BITLEN, &new)) < 0)
		return 0;
	newlabel = new ? "" : "old ";
	if(p9bitnum((char*)cp + 1*P9BITLEN, &lox) < 0)
		return 0;
	if(p9bitnum((char*)cp + 2*P9BITLEN, &loy) < 0)
		return 0;
	if(p9bitnum((char*)cp + 3*P9BITLEN, &hix) < 0)
		return 0;
	if(p9bitnum((char*)cp + 4*P9BITLEN, &hiy) < 0)
		return 0;

	hix -= lox;
	hiy -= loy;
	if(hix <= 0 || hiy <= 0)
		return 0;

	if(dep < 8){
		px = 8/dep;		/* pixels per byte */
		/* set l to number of bytes of data per scan line */
		len = (hix+px-1)/px;
	}else
		len = hix*dep/8;
	len *= hiy;			/* col length */
	len += 5 * P9BITLEN;		/* size of initial ascii */

	/*
	 * for compressed images, don't look any further. otherwise:
	 * for image file, length is non-zero and must match calculation above.
	 * for /dev/window and /dev/screen the length is always zero.
	 * for subfont, the subfont header should follow immediately.
	 */
	if (cmpr) {
		print(mime ? "image/p9bit\n" : "Compressed %splan 9 image or subfont, depth %d, size %dx%d\n",
			newlabel, dep, hix, hiy);
		return 1;
	}
	/*
	 * mbuf->length == 0 probably indicates reading a pipe.
	 * Ghostscript sometimes produces a little extra on the end.
	 */
	if (len != 0 && (mbuf->length == 0 || mbuf->length == len ||
	    mbuf->length > len && mbuf->length < len+P9BITLEN)) {
		print(mime ? "image/p9bit\n" : "%splan 9 image, depth %d, size %dx%d\n",
			newlabel, dep, hix, hiy);
		return 1;
	}
	if (p9subfont(buf+len)) {
		print(mime ? "image/p9bit\n" : "%ssubfont file, depth %d, size %dx%d\n",
			newlabel, dep, hix, hiy);
		return 1;
	}
	return 0;
}

int
p9subfont(uchar *p)
{
	int n, h, a;

	/* if image too big, assume it's a subfont */
	if (p+3*P9BITLEN > buf+sizeof(buf))
		return 1;

	if (p9bitnum((char*)p + 0*P9BITLEN, &n) < 0)	/* char count */
		return 0;
	if (p9bitnum((char*)p + 1*P9BITLEN, &h) < 0)	/* height */
		return 0;
	if (p9bitnum((char*)p + 2*P9BITLEN, &a) < 0)	/* ascent */
		return 0;
	if(n > 0 && h > 0 && a >= 0)
		return 1;
	return 0;
}

#define	WHITESPACE(c)		((c) == ' ' || (c) == '\t' || (c) == '\n')

int
isp9font(void)
{
	uchar *cp, *p;
	int i, n;
	char pathname[1024];

	cp = buf;
	if (!getfontnum(cp, &cp))	/* height */
		return 0;
	if (!getfontnum(cp, &cp))	/* ascent */
		return 0;
	for (i = 0; cp=(uchar*)strchr((char*)cp, '\n'); i++) {
		if (!getfontnum(cp, &cp))	/* min */
			break;
		if (!getfontnum(cp, &cp))	/* max */
			return 0;
		getfontnum(cp, &cp);	/* optional offset */
		while (WHITESPACE(*cp))
			cp++;
		for (p = cp; *cp && !WHITESPACE(*cp); cp++)
				;
			/* construct a path name, if needed */
		n = 0;
		if (*p != '/' && slash) {
			n = slash-fname+1;
			if (n < sizeof(pathname))
				memcpy(pathname, fname, n);
			else n = 0;
		}
		if (n+cp-p+4 < sizeof(pathname)) {
			memcpy(pathname+n, p, cp-p);
			n += cp-p;
			pathname[n] = 0;
			if (access(pathname, AEXIST) < 0) {
				strcpy(pathname+n, ".0");
				if (access(pathname, AEXIST) < 0)
					return 0;
			}
		}
	}
	if (i) {
		print("%s\n", mime ? PLAIN : "font file");
		return 1;
	}
	return 0;
}

int
getfontnum(uchar *cp, uchar **rp)
{
	while (WHITESPACE(*cp))		/* extract ulong delimited by whitespace */
		cp++;
	if (*cp < '0' || *cp > '9')
		return 0;
	strtoul((char *)cp, (char **)rp, 0);
	if (!WHITESPACE(**rp)) {
		*rp = cp;
		return 0;
	}
	return 1;
}

int
isrtf(void)
{
	if(strstr((char *)buf, "\\rtf1")){
		print(mime ? "application/rtf\n" : "rich text format\n");
		return 1;
	}
	return 0;
}

int
ismsdos(void)
{
	if (buf[0] == 0x4d && buf[1] == 0x5a){
		print(mime ? "application/x-msdownload\n" : "MSDOS executable\n");
		return 1;
	}
	return 0;
}

int
isicocur(void)
{
	if(buf[0] || buf[1] || buf[3] || buf[9])
		return 0;
	if(buf[4] == 0x00 && buf[5] == 0x00)
		return 0;
	switch(buf[2]){
	case 1:
		print(mime ? "image/x-icon\n" : "Microsoft icon file\n");
		return 1;
	case 2:
		print(mime ? "image/x-icon\n" : "Microsoft cursor file\n");
		return 1;
	}
	return 0;
}

int
iself(void)
{
	static char *cpu[] = {		/* NB: incomplete and arbitary list */
	[1]	"WE32100",
	[2]	"SPARC",
	[3]	"i386",
	[4]	"M68000",
	[5]	"M88000",
	[6]	"i486",
	[7]	"i860",
	[8]	"R3000",
	[9]	"S370",
	[10]	"R4000",
	[15]	"HP-PA",
	[18]	"sparc v8+",
	[19]	"i960",
	[20]	"PPC-32",
	[21]	"PPC-64",
	[40]	"ARM",
	[41]	"Alpha",
	[43]	"sparc v9",
	[50]	"IA-64",
	[62]	"AMD64",
	[75]	"VAX",
	};
	static char *type[] = {
	[1]	"relocatable object",
	[2]	"executable",
	[3]	"shared library",
	[4]	"core dump",
	};

	if (memcmp(buf, "\x7fELF", 4) == 0){
		if (!mime){
			int isdifend = 0;
			int n = (buf[19] << 8) | buf[18];
			char *p = "unknown";
			char *t = "unknown";

			if (n > 0 && n < nelem(cpu) && cpu[n])
				p = cpu[n];
			else {
				/* try the other byte order */
				isdifend = 1;
				n = (buf[18] << 8) | buf[19];
				if (n > 0 && n < nelem(cpu) && cpu[n])
					p = cpu[n];
			}
			if(isdifend)
				n = (buf[16]<< 8) | buf[17];
			else
				n = (buf[17]<< 8) | buf[16];

			if(n>0 && n < nelem(type) && type[n])
				t = type[n];
			print("%s ELF %s\n", p, t);
		}
		else
			print("application/x-elf-executable\n");
		return 1;
	}

	return 0;
}

int
isface(void)
{
	int i, j, ldepth, l;
	char *p;

	ldepth = -1;
	for(j = 0; j < 3; j++){
		for(p = (char*)buf, i=0; i<3; i++){
			if(p[0] != '0' || p[1] != 'x')
				return 0;
			if(buf[2+8] == ',')
				l = 2;
			else if(buf[2+4] == ',')
				l = 1;
			else
				return 0;
			if(ldepth == -1)
				ldepth = l;
			if(l != ldepth)
				return 0;
			strtoul(p, &p, 16);
			if(*p++ != ',')
				return 0;
			while(*p == ' ' || *p == '\t')
				p++;
		}
		if (*p++ != '\n')
			return 0;
	}

	if(mime)
		print("application/x-face\n");
	else
		print("face image depth %d\n", ldepth);
	return 1;
}