From 05095851346f52c8e918176e8e2abdf0b21de5ec Mon Sep 17 00:00:00 2001 From: dwheeler Date: Fri, 7 Jul 2006 13:36:27 +0000 Subject: Initial import (sloccount 2.26) git-svn-id: svn://svn.code.sf.net/p/sloccount/code/trunk@1 d762cc98-fd17-0410-9a0d-d09172385bc5 --- break_filelist.orig | 1084 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 1084 insertions(+) create mode 100755 break_filelist.orig (limited to 'break_filelist.orig') diff --git a/break_filelist.orig b/break_filelist.orig new file mode 100755 index 0000000..b34c702 --- /dev/null +++ b/break_filelist.orig @@ -0,0 +1,1084 @@ +#!/usr/bin/perl -w + +# break_filelist +# Take a list of dirs which contain a "filelist"; +# creates files in each directory identifying which are C, C++, Perl, etc. +# For example, "ansic.dat" lists all ANSI C files contained in filelist. +# Note: ".h" files are ambiguous (they could be C or C++); the program +# uses heuristics to determine this. +# The list of .h files is also contained in h_list.dat. + +# (C) Copyright 2000-2001 David A. Wheeler +# Part of "SLOCCount", and released under the GPL version 2; +# see the documentation for details. + +# If adding a new language: add the logic to open the file, +# close the file, and detect & write to the file listing that language. + +# Debatable decisions: +# Doesn't count .dsl files (stylesheets, which are partially LISP). +# Doesn't count .sql files (SQL queries & commands) + +# Note - I don't try to distinguish between TCL and [incr TCL] (itcl), +# an OO extended version of TCL. For our purposes, it's all TCL. + + +use FileHandle; + + +# Set default configuration: + +$duplicates_okay = 0; # Set to 1 if you want to count file duplicates. +$crossdups_okay = 0; # Set to 1 if duplicates okay in different filelists. +$autogen_okay = 0; # Set to 1 if you want to count autogen'ed files. +$noisy = 0; # Set to 1 if you want noisy reports. +%lang_list_files = (); + +# The following extensions are NOT code: +%not_code_extensions = ( + "html" => 1, + "in" => 1, # Debatable. + "xpm" => 1, + "po" => 1, + "am" => 1, # Debatable. + "1" => 1, # Man pages (documentation): + "2" => 1, + "3" => 1, + "4" => 1, + "5" => 1, + "6" => 1, + "7" => 1, + "8" => 1, + "9" => 1, + "n" => 1, + "gif" => 1, + "tfm" => 1, + "png" => 1, + "m4" => 1, # Debatable. + "bdf" => 1, + "sgml" => 1, + "mf" => 1, + "txt" => 1, + "man" => 1, + "xbm" => 1, + "Tag" => 1, + "sgm" => 1, + "vf" => 1, + "tex" => 1, + "elc" => 1, + "gz" => 1, + "dic" => 1, + "pfb" => 1, + "fig" => 1, + "afm" => 1, + "jpg" => 1, + "bmp" => 1, + "htm" => 1, + "kdelnk" => 1, + "desktop" => 1, + "pbm" => 1, + "pdf" => 1, + "ps" => 1, # Postscript is _USUALLY_ generated automatically. + "eps" => 1, + "doc" => 1, + "man" => 1, + "o" => 1, # Object code is generated from source code. + "a" => 1, # Static object code. + "so" => 1, # Dynamically-loaded object code. +); + +# The following filenames are NOT code: +%not_code_filenames = ( + "README" => 1, + "Readme" => 1, + "readme" => 1, + "README.tk" => 1, # used in kdemultimedia, it's confusing. + "Changelog" => 1, + "ChangeLog" => 1, + "Repository" => 1, + "CHANGES" => 1, + "Changes" => 1, + ".cvsignore" => 1, + "Root" => 1, # CVS. + "BUGS" => 1, + "TODO" => 1, + "COPYING" => 1, + "MAINTAINERS" => 1, + "Entries" => 1, + # Skip "iconfig.h" files; they're used in Imakefiles + # (used in xlockmore): + "iconfig.h" => 1, +); + + +# A filename ending in the following extensions usually maps to the +# given language: + +%file_extensions = ( + "c" => "ansic", + "ec" => "ansic", # Informix C. + "ecp" => "ansic", # Informix C. + "pgc" => "ansic", # Postgres embedded C/C++ (guess C) + "C" => "cpp", "cpp" => "cpp", "cxx" => "cpp", "cc" => "cpp", + "pcc" => "cpp", # Input to Oracle C++ preproc. + "m" => "objc", + "h" => "h", "H" => "h", "hpp" => "h", + "ada" => "ada", "adb" => "ada", "ads" => "ada", + "pad" => "ada", # Oracle Ada preprocessor. + "f" => "fortran", + "p" => "pascal", + "py" => "python", + "s" => "asm", "S" => "asm", "asm" => "asm", + "sh" => "sh", "bash" => "sh", + "csh" => "csh", "tcsh" => "csh", + "java" => "java", + "lisp" => "lisp", "el" => "lisp", "scm" => "lisp", "lsp" => "lisp", + "jl" => "lisp", + "tcl" => "tcl", "tk" => "tcl", "itk" => "tcl", + "exp" => "exp", + "pl" => "perl", "pm" => "perl", "perl" => "perl", + "awk" => "awk", + "sed" => "sed", + "y" => "yacc", + "l" => "lex", + "makefile" => "makefile", + "sql" => "sql", + "php" => "php", "php3" => "php", "php4" => "php", "php5" => "php", + "php6" => "php", + "inc" => "inc", # inc MAY be PHP - we'll handle it specially. + # ???: .pco is Oracle Cobol, need to add with a Cobol counter. +); + + +# GLOBAL VARIABLES + +$dup_count = 0; + +$warning_from_first_line = ""; + +%examined_directories = (); # Keys = Names of directories examined this run. + +$duplistfile = ""; + +########### + + +# Handle re-opening individual CODE_FILEs. +# CODE_FILE is public + +# Private value: +$opened_file_name = ""; + +sub reopen { + # Open file if it isn't already, else rewind. + # If filename is "", close any open file. + my $filename = shift; + chomp($filename); + # print("DEBUG: reopen($filename)\n"); + if ($filename eq "") { + if ($opened_file_name) {close(CODE_FILE);} + $opened_file_name = ""; + return; + } + if ($filename eq $opened_file_name) { + seek CODE_FILE, 0, 0; # Rewind. + } else { # We're opening a new file. + if ($opened_file_name) {close(CODE_FILE)} + open(CODE_FILE, "<$filename") || die "Can't open $filename"; + $opened_file_name = $filename; + } +} + +########### + +sub looks_like_cpp { + # returns a confidence level - does the file looks like it's C++? + my $filename = shift; + my $confidence = 0; + chomp($filename); + open( SUSPECT, "<$filename"); + while () { + if (m/^\s*class\b.*\{/) { # "}" + close(SUSPECT); + return 2; + } + if (m/^\s*class\b/) { + $confidence = 1; + } + } + close(SUSPECT); + return $confidence; +} + + +# Cache which files are objective-C or not. +# Key is the full file pathname; value is 1 if objective-C (else 0). +%objective_c_files = (); + +sub really_is_objc { +# Given filename, returns TRUE if its contents really are objective-C. + my $filename = shift; + chomp($filename); + + my $is_objc = 0; # Value to determine. + my $brace_lines = 0; # Lines that begin/end with curly braces. + my $plus_minus = 0; # Lines that begin with + or -. + my $word_main = 0; # Did we find "main("? + my $special = 0; # Did we find a special Objective-C pattern? + + # Return cached result, if available: + if ($objective_c_files{$filename}) { return $objective_c_files{$filename};} + + open(OBJC_FILE, "<$filename") || + die "Can't open $filename to determine if it's objective C.\n"; + while() { + + if (m/^\s*[{}]/ || m/[{}];?\s*$/) { $brace_lines++;} + if (m/^\s*[+-]/) {$plus_minus++;} + if (m/\bmain\s*\(/) {$word_main++;} # "main" followed by "("? + # Handle /usr/src/redhat/BUILD/egcs-1.1.2/gcc/objc/linking.m: + if (m/^\s*\[object name\];\s*$/i) {$special=1;} + } + close(OBJC_FILE); + + if (($brace_lines > 1) && (($plus_minus > 1) || $word_main || $special)) + {$is_objc = 1;} + + $objective_c_files{$filename} = $is_objc; # Store result in cache. + + return $is_objc; +} + + +# Cache which files are lex or not. +# Key is the full file pathname; value is 1 if lex (else 0). +%lex_files = (); + +sub really_is_lex { +# Given filename, returns TRUE if its contents really is lex. +# lex file must have "%%", "%{", and "%}". +# In theory, a lex file doesn't need "%{" and "%}", but in practice +# they all have them, and requiring them avoid mislabeling a +# non-lexfile as a lex file. + + my $filename = shift; + chomp($filename); + + my $is_lex = 0; # Value to determine. + my $percent_percent = 0; + my $percent_opencurly = 0; + my $percent_closecurly = 0; + + # Return cached result, if available: + if ($lex_files{$filename}) { return $lex_files{$filename};} + + open(LEX_FILE, "<$filename") || + die "Can't open $filename to determine if it's lex.\n"; + while() { + $percent_percent++ if (m/^\s*\%\%/); + $percent_opencurly++ if (m/^\s*\%\{/); + $percent_closecurly++ if (m/^\s*\%\}/); + } + close(LEX_FILE); + + if ($percent_percent && $percent_opencurly && $percent_closecurly) + {$is_lex = 1;} + + $lex_files{$filename} = $is_lex; # Store result in cache. + + return $is_lex; +} + + +# Cache which files are expect or not. +# Key is the full file pathname; value is 1 if it is (else 0). +%expect_files = (); + +sub really_is_expect { +# Given filename, returns TRUE if its contents really are Expect. +# Many "exp" files (such as in Apache and Mesa) are just "export" data, +# summarizing something else # (e.g., its interface). +# Sometimes (like in RPM) it's just misc. data. +# Thus, we need to look at the file to determine +# if it's really an "expect" file. + + my $filename = shift; + chomp($filename); + +# The heuristic is as follows: it's Expect _IF_ it: +# 1. has "load_lib" command and either "#" comments or {}. +# 2. {, }, and one of: proc, if, [...], expect + + my $is_expect = 0; # Value to determine. + + my $begin_brace = 0; # Lines that begin with curly braces. + my $end_brace = 0; # Lines that begin with curly braces. + my $load_lib = 0; # Lines with the Load_lib command. + my $found_proc = 0; + my $found_if = 0; + my $found_brackets = 0; + my $found_expect = 0; + my $found_pound = 0; + + # Return cached result, if available: + if ($expect_files{$filename}) { return expect_files{$filename};} + + open(EXPECT_FILE, "<$filename") || + die "Can't open $filename to determine if it's expect.\n"; + while() { + + if (m/#/) {$found_pound++; s/#.*//;} + if (m/^\s*\{/) { $begin_brace++;} + if (m/\{\s*$/) { $begin_brace++;} + if (m/^\s*\}/) { $end_brace++;} + if (m/\};?\s*$/) { $end_brace++;} + if (m/^\s*load_lib\s+\S/) { $load_lib++;} + if (m/^\s*proc\s/) { $found_proc++;} + if (m/^\s*if\s/) { $found_if++;} + if (m/\[.*\]/) { $found_brackets++;} + if (m/^\s*expect\s/) { $found_expect++;} + } + close(EXPECT_FILE); + + if ($load_lib && ($found_pound || ($begin_brace && $end_brace))) + {$is_expect = 1;} + if ( $begin_brace && $end_brace && + ($found_proc || $found_if || $found_brackets || $found_expect)) + {$is_expect = 1;} + + $expect_files{$filename} = $is_expect; # Store result in cache. + + return $is_expect; +} + + +# Cached values. +%pascal_files = (); + +sub really_is_pascal { +# Given filename, returns TRUE if its contents really are Pascal. + +# This isn't as obvious as it seems. +# Many ".p" files are Perl files +# (such as /usr/src/redhat/BUILD/ispell-3.1/dicts/czech/glob.p), +# others are C extractions +# (such as /usr/src/redhat/BUILD/linux/include/linux/umsdos_fs.p +# and some files in linuxconf). +# However, test files in "p2c" really are Pascal, for example. + +# Note that /usr/src/redhat/BUILD/ucd-snmp-4.1.1/ov/bitmaps/UCD.20.p +# is actually C code. The heuristics determine that they're not Pascal, +# but because it ends in ".p" it's not counted as C code either. +# I believe this is actually correct behavior, because frankly it +# looks like it's automatically generated (it's a bitmap expressed as code). +# Rather than guess otherwise, we don't include it in a list of +# source files. Let's face it, someone who creates C files ending in ".p" +# and expects them to be counted by default as C files in SLOCCount needs +# their head examined. I suggest examining their head +# with a sucker rod (see syslogd(8) for more on sucker rods). + +# This heuristic counts as Pascal such files such as: +# /usr/src/redhat/BUILD/teTeX-1.0/texk/web2c/tangleboot.p +# Which is hand-generated. We don't count woven documents now anyway, +# so this is justifiable. + + my $filename = shift; + chomp($filename); + +# The heuristic is as follows: it's Pascal _IF_ it has all of the following: +# 1. "^..program NAME(...);" or "..unit NAME". +# 2. "procedure", "function", "^..interface", or "^..implementation" +# 3. a "begin", and +# 4. it ends with "end." (ignoring {...} comments). +# The last requirement in particular filters out non-Pascal. + + + my $is_pascal = 0; # Value to determine. + + my $has_program = 0; + my $has_unit = 0; + my $has_procedure_or_function = 0; + my $found_begin = 0; + my $found_terminating_end = 0; + + # Return cached result, if available: + if ($pascal_files{$filename}) { return pascal_files{$filename};} + + open(PASCAL_FILE, "<$filename") || + die "Can't open $filename to determine if it's pascal.\n"; + while() { + if (m/\bprogram\s+[A-Za-z]/i) {$has_program=1;} + if (m/\bunit\s+[A-Za-z]/i) {$has_unit=1;} + if (m/\bprocedure\b/i) { $has_procedure_or_function = 1; } + if (m/\bfunction\b/i) { $has_procedure_or_function = 1; } + if (m/^\s*interface\s+/i) { $has_procedure_or_function = 1; } + if (m/^\s*implementation\s+/i) { $has_procedure_or_function = 1; } + if (m/\bbegin\b/i) { $has_begin = 1; } + s/\{.*?\}//g; # Ignore comments on this line; imperfect, but effective. + # This heuristic fails if there are multi-line comments after + # "end."; I haven't seen that in real Pascal programs: + if (m/end\.\s*$/i) {$found_terminating_end = 1;} + elsif (m/\S/) {$found_terminating_end = 0;} + } + close(PASCAL_FILE); + + # Okay, we've examined the entire file looking for clues; + # let's use those clues to determine if it's really Pascal: + + if ( ($has_unit || $has_program) && $has_procedure_or_function && + $has_begin && $found_terminating_end) + {$is_pascal = 1;} + + $pascal_files{$filename} = $is_pascal; # Store result in cache. + + return $is_pascal; +} + +# Cache which files are php or not. +# Key is the full file pathname; value is 1 if it is (else 0). +%php_files = (); + +sub really_is_php { +# Given filename, returns TRUE if its contents really is php. + + my $filename = shift; + chomp($filename); + + my $is_php = 0; # Value to determine. + # Need to find a matching pair of surrounds, with ending after beginning: + my $normal_surround = 0; # + my $script_surround = 0; # ; bit 0 =