ocolumn.ml
1: (* ocolumn: a version of *BSD's column(1) that doesn't dump core on long lines
2: Keith Waclena <http://www.lib.uchicago.edu/keith/>
3: *)
4:
5: open Utils
6: open Printf
7:
8: (* "for programs that run for a short time but allocate like crazy, *)
9: (* the default value of Gc.space_overhead is a little too low and causes *)
10: (* the garbage collector to work unnecessarily hard" *)
11: let _ = Gc.set { (Gc.get()) with Gc.space_overhead = 100 }
12:
13: (* print usage message *)
14: let usage = sprintf "Usage: %s [-i STR] [-m] [-o STR] [-r REGEXP] [-s STR] [-t] [--] file ..."
15:
16: (* cmdline parameters and related *)
17:
18: let isep = ref None (* input field separator *)
19: let merge = ref false (* merge adjacent empty fields *)
20: let osep = ref " " (* output field separator *)
21: let regexp = ref (Some "\t") (* regular expression split *)
22: let compiled = ref (Pcre.regexp "") (* compiled version of !regexp *)
23: let names = ref [] (* filenames *)
24: let sepopts = ref 0
25:
26: let get = function
27: | Some x -> x
28: | None -> raise Not_found
29:
30: (* speclist for Arg.parse cmdline parsing *)
31: let speclist = [
32: ("-i", Arg.String (fun s -> isep := Some s; regexp := None; incr sepopts),
33: "STR: input field separator (default: '%s')");
34: ("-m", Arg.Unit (fun () -> merge := true),
35: (sprintf ": merge adjacent empty fields (like awk; default: %s)" (if !merge then "true" else "false")));
36: ("-o", Arg.String (fun s -> osep := s),
37: (sprintf "STR: output field separator (default: '%s')" (String.escaped !osep)));
38: ("-r", Arg.String (fun s -> regexp := Some s; isep := None; incr sepopts),
39: "REGEXP: field separator as regular expression");
40: ("-s", Arg.String (fun s -> isep := Some s; osep := s; regexp := None; incr sepopts),
41: "STR: field separator (sets -i and -o)");
42: ("-t", Arg.Unit (fun () -> ()),
43: ": noop for compatibility with BSD column");
44: ("--", Arg.Rest (fun name -> names := !names @ [name]),
45: ": stop interpreting options");
46: ]
47:
48: (* return maximum int in a list of ints *)
49: let maxwidth = List.fold_left max 0
50:
51: (* our most important data structure is an int list list, representing *)
52: (* a matrix of column widths; the matrix must be rectangular. maxcols *)
53: (* takes such a matrix and reduces it to an int list indicating the *)
54: (* max width in each column of the matrix. if the matrix has N *)
55: (* rows and M columns, the result of maxcols is a list of length M *)
56: let maxcols = (List.map maxwidth) & transpose
57:
58: let splitter str =
59: match !regexp with
60: | Some r -> Pcre.split ~rex:!compiled ~max:(-1) str
61: | None -> split ~merge:!merge (get !isep) str
62:
63: (* columnate files in argv according to widths in widths
64: widths is a list of integer max column widths (such as returned by maxcols)
65: argv is a list of filenames
66: *)
67: let columnate widths argv =
68: (* columnate (as above) one file (open on chan) *)
69: let columnatechan chan =
70: (* given list of strings (ie fields) widen each one and join into a
71: string, trimming trailing spaces *)
72: let widenfields fields =
73: (* pad s out to n chars with spaces *)
74: let widen n s =
75: let len = String.length s in
76: assert (len <= n);
77: s ^ (String.make (n-len) ' ')
78: in
79: trimright ' ' (String.concat !osep (List.map2 widen widths fields))
80: in
81: (* widen fields of a line and print result *)
82: let widenprint line = print_endline (widenfields (splitter line))
83: in
84: iterlines widenprint chan
85: in
86: List.iter (with_open_in_file columnatechan) argv
87:
88: (* return widths of fields in line as list of ints *)
89: let linewidths len line =
90: let fields = List.map String.length (splitter line) in
91: match len with
92: | Some n ->
93: if List.length fields <> n
94: then
95: let err = (sprintf "non-rectangular data (NOT all lines have %d fields)" n) in
96: raise (Invalid_argument err)
97: else fields
98: | None -> fields
99:
100: (* return an int list list of all "linewidths" in chan. a *)
101: (* "linewidth" is an int list representing the widths of all the *)
102: (* fields in a given line; a list of such linewidths represents the *)
103: (* column widths of all the lines in a file (open on chan) *)
104: let getlinewidths chan =
105: let widths1 = linewidths None (input_line chan) in
106: let n = List.length widths1 in
107: widths1::(maplines (linewidths (Some n)) chan)
108:
109: (* get max widths of columns in file open on chan as list of ints *)
110: let getcolwidths = maxcols & getlinewidths
111:
112: (* get max widths across all named files *)
113: let getfilewidths argv = maxcols (List.map (with_open_in_file getcolwidths) argv)
114:
115: (* check command line options *)
116: let checkopts () =
117: match !regexp with
118: | Some r ->
119: if !merge then raise (Arg.Bad "-m and -r are incompatible");
120: if !sepopts > 1 then raise (Arg.Bad "-r is incompatible with -i and -s");
121: compiled := Pcre.regexp r;
122: ()
123: | None ->
124: ()
125:
126: (* main routine: handle cmdline options and args, and columnate *)
127: let main () =
128: let collect name = names := !names @ [name] in
129: let msg = (usage (basename Sys.argv.(0))) in
130: let _ = Arg.parse speclist collect msg in
131: let argv = (tmpstdin (if !names = [] then ["-"] else !names)) in
132: checkopts ();
133: columnate (getfilewidths argv) argv
134:
135: let _ = main ()
136:
This document was generated using caml2html