#! /usr/bin/env python
#
# Part of the A-A-P project: File type detection module

# Copyright (C) 2002 Stichting NLnet Labs
# Permission to copy and use this file is specified in the file COPYING.
# If this file is missing you can find it here: http://www.a-a-p.org/COPYING

# This module detects the type of a file.
# It can be run as a separate program or called from Python.
# Many types are recognized by default.  More types can be added dynamically.
# See filetype.txt for an explanation.
#
#
# EXTERNAL INTERFACE:
#
# ft_detect(fname)          Detects the type of file "fname".
#
# ft_check_dir(dir [, errmsg])
#                           Scan directory "dir" for "*.afd" files, which are
#                           loaded with ft_read_file().
#
# ft_read_file(fname)       Read file "fname" for detection rules.
#
# ft_add_rules(str)         Add file type detection rules from "str".  See
#                           "filetype.txt" for the syntax.
#

import string
import os.path
import re
# Imported under a private alias so that it cannot clash with anything that
# "from Util import *" brings in.
import glob as _glob

from Util import *

# Set to non-zero when run as a program.
_run_as_program = 0

#
# The default list of detected file types.
# NOTE: since "append" isn't used, the order of checking is last one first!
#
_def_detect_list = """
suffix c        c
suffix h        cpp
suffix hh       cpp
suffix H        cpp
suffix hxx      cpp
suffix hpp      cpp
suffix cpp      cpp
suffix cc       cpp
suffix C        cpp
suffix c++      cpp
suffix cxx      cpp
suffix moc      cpp
suffix tcc      cpp
suffix inl      cpp
suffix py       python
suffix pl       perl
suffix sh       sh
suffix aap      aap
suffix afd      afd
suffix html     html
suffix htm      html
suffix Z        ignore
suffix gz       ignore
suffix bz2      ignore
suffix bak      ignore
regexp .*enlightenment/.*.cfg$ c
regexp .*vimrc$ vim
regexp .*\\bconfigure$ sh
script .*\\bpython python
script .*\\bperl perl
script .*csh\\b csh
script .*\\bbash sh
"""

# List of _Ft_py objects: Python code executed to detect file type.
# Used first.
_py_list_before = []

# Dictionary used to map file name extension to file type.
_suffix_dict = {}

# List of _Ft_re objects; a match of the RE with the file name defines the file
# type.
_regexp_list = []

# List of _Ft_re objects: a match of the RE with the script in the first line
# of the file defines the file type.
_script_list = []

# List of _Ft_py objects: Python code executed to detect file type.
# Used after everything else didn't detect the type.
_py_list_after = []

_did_init = 0       # non-zero when __init__() did its work


def __init__():
    """Reset all rule tables and load the default, system and user rules.

    Only does its work the first time it is called; later calls return
    immediately."""
    global _suffix_dict, _regexp_list, _script_list
    global _py_list_before, _py_list_after
    global _did_init

    # this only needs to be done once
    if _did_init:
        return
    # Set the flag before loading rules: ft_add_rules() calls __init__()
    # again and must not recurse endlessly.
    _did_init = 1

    _py_list_before = []
    _suffix_dict = {}
    _regexp_list = []
    _script_list = []
    _py_list_after = []

    # Load the built-in detection rules.
    ft_add_rules(_def_detect_list)

    # Load detection rules from system and user *.afd files.
    ft_check_dir("/usr/local/share/aap/afd")
    ft_check_dir(os.path.expanduser("~/.aap/afd"))


class DetectError(Exception):
    """Error for something gone wrong."""
    def __init__(self, args=None):
        # Hand the message to the base class instead of assigning "self.args"
        # directly: assigning a string there turns it into a tuple of
        # characters and assigning None fails.
        if args is None:
            Exception.__init__(self)
        elif isinstance(args, tuple):
            Exception.__init__(self, *args)
        else:
            Exception.__init__(self, args)


def _report_error(msg):
    """Report error "msg": printed when run as a program, otherwise passed on
    to msg_error() of the Message module."""
    if _run_as_program:
        print(msg)
    else:
        from Message import msg_error
        msg_error(msg)


def ft_check_dir(dir, errmsg=0):
    """Check directory "dir" for *.afd files and load them.
       When "errmsg" is non-zero give an error message when the directory
       doesn't exist.
       A file that fails to load is reported and does not stop the scan."""
    if os.path.exists(dir) and os.path.isdir(dir):
        for f in _glob.glob(os.path.join(dir, "*.afd")):
            try:
                ft_read_file(f)
            except DetectError as e:
                _report_error(str(e))
    elif errmsg:
        _report_error(_('Directory does not exist: "%s"') % dir)


def ft_read_file(fname):
    """Read file "fname" for file type detection rules.
       Raises DetectError when the file cannot be opened or read, or when it
       contains an invalid rule."""
    try:
        rule_file = open(fname)
    except (IOError, OSError) as e:
        raise DetectError((_('Cannot open "%s": ') % fname) + str(e))
    # The text is kept in "rules", not "str": a local called "str" would hide
    # the builtin that the error handlers need.
    try:
        try:
            rules = rule_file.read()
        except (IOError, OSError, UnicodeError) as e:
            raise DetectError((_('Cannot read "%s": ') % fname) + str(e))
    finally:
        # Close the file also when reading failed.
        rule_file.close()

    ft_add_rules(rules)


def ft_add_rules(str):
    """Add file type detection rules from string "str".
       Each non-empty, non-comment line starts with the kind of rule:
       "suffix", "regexp", "script" or "python".  A "python" line is followed
       by more-indented lines holding the Python commands.
       Raises DetectError for a line that cannot be parsed."""
    # Always load the default rules first (skipped when done already).
    __init__()

    # Split the string into individual lines.
    lines = str.split('\n')

    # Loop over all the lines (may use more than one for python items).
    line_idx = 0
    line_count = len(lines)
    while line_idx < line_count:
        line = lines[line_idx]
        line_len = len(line)

        # isolate first word: type of detection.
        ds = skip_white(line, 0)        # detection start
        # ignore empty and comment lines
        if ds == line_len or line[ds] == '#':
            line_idx = line_idx + 1
            continue
        de = skip_to_white(line, ds)    # detection end
        item = line[ds:de]
        arg_start = skip_white(line, de)

        # isolate first argument, which may be in quotes
        if arg_start < line_len:
            if line[arg_start] == '"' or line[arg_start] == "'":
                quote = line[arg_start]
                arg_start = arg_start + 1
                arg_end = arg_start
                while arg_end < line_len and line[arg_end] != quote:
                    arg_end = arg_end + 1
                if arg_end == line_len:
                    raise DetectError(_('Missing quote in "%s"') % line)
                n = arg_end + 1         # continue after the closing quote
            else:
                arg_end = arg_start
                while (arg_end < line_len and line[arg_end] != ' '
                                               and line[arg_end] != '\t'):
                    arg_end = arg_end + 1
                n = arg_end
            arg1 = line[arg_start:arg_end]
            n = skip_white(line, n)
        else:
            arg1 = ''
            n = line_len

        # Isolate further arguments (no quotes!).
        # A superfluous argument is silently ignored (could be a comment).
        args = line[n:].split()
        if len(args) >= 1:
            arg2 = args[0]
        else:
            arg2 = ''
        if len(args) >= 2:
            arg3 = args[1]
        else:
            arg3 = ''

        if item == "suffix":
            if not arg2:
                raise DetectError(_('Missing argument in "%s"') % line)
            _add_suffix(arg1, arg2)

        elif item == "regexp":
            if not arg2:
                raise DetectError(_('Missing argument in "%s"') % line)
            _add_regexp(arg1, arg2, arg3 == "append")

        elif item == "script":
            if not arg2:
                raise DetectError(_('Missing argument in "%s"') % line)
            _add_script(arg1, arg2, arg3 == "append")

        elif item == "python":
            append = 0
            after = 0
            for arg in [arg1, arg2]:
                if arg:
                    if arg == "append":
                        append = 1
                    elif arg == "after":
                        after = 1
                    else:
                        raise DetectError(_('Illegal argument in "%s"') % line)

            # Collect the following lines that are indented more than the
            # "python" line itself; they are the commands.
            start_indent = get_indent(line)
            line_idx = line_idx + 1
            start_line_idx = line_idx
            cmds = ""
            # Bounded by the number of lines, not the length of this line.
            while line_idx < line_count:
                line = lines[line_idx]
                if get_indent(line) <= start_indent:
                    line_idx = line_idx - 1     # this line has next item
                    break
                cmds = cmds + line + '\n'
                line_idx = line_idx + 1
            if not cmds:
                raise DetectError(_('Python commands missing'))
            _add_python(cmds,
                        _("filetype detection line %d") % start_line_idx,
                        after, append)

        else:
            raise DetectError(
                    _("Illegal item %s in argument to ft_add_rules()") % item)

        line_idx = line_idx + 1


class _Ft_re:
    """Class used to store pairs of RE and file type."""
    def __init__(self, re, type):
        self.re = re
        self.type = type


class _Ft_py:
    """Class used to store Python code for detecting a file type."""
    def __init__(self, code, error_msg):
        self.code = code                # the Python code
        self.error_msg = error_msg      # message used for errors


def _add_suffix(suf, type):
    """Add detection of "type" by file name extension "suf".
       When "type" is "ignore" it means the suffix is removed and further
       detection done on the rest.
       When "type" is "remove" an existing detection for "suf" is removed."""
    if type == 'remove':
        if suf in _suffix_dict:
            del _suffix_dict[suf]
    else:
        _suffix_dict[suf] = type


def _add_regexp(re, type, append):
    """Add detection of "type" by matching the file name with Python regular
       expression "re".
       When append is non-zero, add to the end of the regexp rules.
       When "type" is "remove" an existing detection for "re" is removed."""
    if type == 'remove':
        # Replace the contents in place; removing items while looping over
        # the same list would skip entries.
        _regexp_list[:] = [r for r in _regexp_list if r.re != re]
    else:
        f = _Ft_re(re, type)
        if append:
            _regexp_list.append(f)
        else:
            _regexp_list.insert(0, f)


def _add_script(re, type, append):
    """Add detection of "type" by matching the script name in the first line of
       the file with Python regular expression "re".
       When append is non-zero, add to the end of the script rules.
       When "type" is "remove" an existing detection for "re" is removed."""
    if type == 'remove':
        # Replace the contents in place; removing items while looping over
        # the same list would skip entries.
        _script_list[:] = [r for r in _script_list if r.re != re]
    else:
        f = _Ft_re(re, type)
        if append:
            _script_list.append(f)
        else:
            _script_list.insert(0, f)


def _add_python(code, error_msg, after, append):
    """Add detection of "type" by using Python code "code".
       Each line in "code" must end in a newline character.
       "error_msg" is printed when executing the code results in an error.
       When "after" is non-zero use this rule after suffix, regexp and script
       rules.
       When append is non-zero, add to the end of the python rules."""
    p = _Ft_py(code, error_msg)
    if after:
        rules = _py_list_after
    else:
        rules = _py_list_before
    if append:
        rules.append(p)
    else:
        rules.insert(0, p)


def _exec_py(fname, item):
    """Execute the code defined with _add_python().
       The code sees the file name as "fname" and reports its result by
       assigning to "type".  Returns that value, or None when it was not set.
       Raises DetectError when the code raises an exception."""
    # Make a completely fresh globals dictionary.
    new_globals = {"fname": fname}

    # Prepend "if 1:" to get the indenting right.
    if item.code[0] == ' ' or item.code[0] == '\t':
        code = "if 1:\n" + item.code
    else:
        code = item.code

    # NOTE: this runs arbitrary code taken from the detection rules; rules
    # must only be loaded from trusted sources.
    try:
        exec(code, new_globals, new_globals)
    except Exception as e:
        raise DetectError(item.error_msg + ": " + str(e))

    return new_globals.get("type")


def ft_detect(fname):
    """Detect the file type for file "fname".
       Returns the type as a string or None.
       The checks are done in this order: "python" rules, suffix rules, regexp
       rules, script rules and finally "python after" rules.
       Raises DetectError when Python code of a rule fails."""
    # Initialize (will skip when done already)
    __init__()

    # On non-Posix systems we ignore case differences by making the name lower
    # case.
    if os.name != 'posix':
        fname = fname.lower()

    # Do the python code checks.
    for p in _py_list_before:
        ftype = _exec_py(fname, p)
        if ftype:
            return ftype

    # Try the extension, this is fastest.
    # When "fname" has several extensions, try with all of them first, then
    # try by removing the first ones:  "f.html.c": "html.c" then ".c".
    bn = os.path.basename(fname)
    i = bn.find(".")
    while i > 0 and i + 1 < len(bn):
        # Found a dot that's not the first or last character.
        suffix = bn[i + 1:]
        if suffix in _suffix_dict:
            ft = _suffix_dict[suffix]
            if ft != "ignore":
                return ft
            # remove an ignored extension and start over with the rest
            fname = fname[:-(len(bn[i:]))]
            bn = bn[:i]
            i = 0
        i = bn.find(".", i + 1)

    # match all defined REs with the file name.
    # TODO: handle "/" in RE and fname.
    for r in _regexp_list:
        if re.match(r.re, fname):
            return r.type

    # match all defined REs with the script name in the first line of the
    # file.
    try:
        with open(fname) as f:
            line = f.readline()
    except (IOError, OSError, UnicodeError):
        # Errors for files that can't be read are ignored.
        pass
    else:
        if len(line) > 2 and line[:2] == "#!":
            # TODO: remove "env VAR=val" and script arguments from line
            for r in _script_list:
                if re.match(r.re, line[2:]):
                    return r.type

    # Do the python code checks.
    for p in _py_list_after:
        ftype = _exec_py(fname, p)
        if ftype:
            return ftype

    return None


# When executed as a program, detect the type of the specified file.
if __name__ == '__main__':
    import sys

    # Internationalisation inits: setlocale and gettext.
    i18n_init()

    # Rule sources in command line order; each entry is {"dir": name} or
    # {"file": name}.
    items = []
    checkfile = None
    _run_as_program = 1

    # Check for any "-Idir", "-I dir", "-ffile" and "-f file" arguments.
    next_is_dir = 0
    next_is_file = 0
    for arg in sys.argv[1:]:
        if next_is_dir:
            items.append({"dir": arg})
            next_is_dir = 0
        elif next_is_file:
            items.append({"file": arg})
            next_is_file = 0
        elif arg.startswith("-I"):
            if len(arg) > 2:
                items.append({"dir": arg[2:]})
            else:
                next_is_dir = 1
        elif arg.startswith("-f"):
            if len(arg) > 2:
                items.append({"file": arg[2:]})
            else:
                next_is_file = 1
        else:
            if checkfile:
                print(_("Can only check one file"))
                sys.exit(1)
            checkfile = arg

    if next_is_dir:
        print(_("-I argument must be followed by a directory name"))
        sys.exit(1)
    if next_is_file:
        print(_("-f argument must be followed by a file name"))
        sys.exit(1)
    if not checkfile:
        print(_("Usage: %s [-I ruledir] [-f rulefile] filename") % sys.argv[0])
        sys.exit(1)

    # load the built-in default rules
    __init__()

    # Check specified directories for *.afd files and read specified files.
    for item in items:
        if "dir" in item:
            ft_check_dir(item["dir"])
        else:
            try:
                ft_read_file(item["file"])
            except DetectError as e:
                print(e)

    # Use the file name found above; sys.argv[1] may be an option.
    print(ft_detect(checkfile))


# vim: set sw=4 sts=4 tw=79 fo+=l: