ongrep

A cleaned up fork of ngrep for OpenBSD
git clone git://git.sgregoratto.me/ongrep
Log | Files | Refs | README | LICENSE

commit 28077a19eb535d2e702bf68e2a49de6bf21b701a
parent 82044ad6c6d43ef250e2796e0266c03e95de7c6c
Author: Jordan Ritter <jpr5@darkridge.com>
Date:   Sat, 24 Feb 2001 22:04:55 +0000

Import of PCRE 3.4 and GNU regex 0.12.

Diffstat:
Aregex-0.12/AUTHORS | 10++++++++++
Aregex-0.12/COPYING | 339+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Aregex-0.12/ChangeLog | 3030+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Aregex-0.12/INSTALL | 117+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Aregex-0.12/Makefile.in | 98+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Aregex-0.12/NEWS | 62++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Aregex-0.12/README | 60++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Aregex-0.12/configure | 462+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Aregex-0.12/configure.in | 23+++++++++++++++++++++++
Aregex-0.12/doc/Makefile.in | 92+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Aregex-0.12/doc/include.awk | 19+++++++++++++++++++
Aregex-0.12/doc/regex.aux | 136+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Aregex-0.12/doc/regex.cps | 152+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Aregex-0.12/doc/regex.info | 2836+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Aregex-0.12/doc/regex.texi | 3138+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Aregex-0.12/doc/texinfo.tex | 3941+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Aregex-0.12/doc/xregex.texi | 3021+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Aregex-0.12/regex.c | 4948+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Aregex-0.12/regex.h | 490+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Aregex-0.12/test/ChangeLog | 77+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Aregex-0.12/test/Makefile.in | 168+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Aregex-0.12/test/TAGS | 373+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Aregex-0.12/test/alloca.c | 194+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Aregex-0.12/test/bsd-interf.c | 38++++++++++++++++++++++++++++++++++++++
Aregex-0.12/test/debugmalloc.c | 273+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Aregex-0.12/test/emacsmalloc.c | 844+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Aregex-0.12/test/fileregex.c | 77+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Aregex-0.12/test/g++malloc.c | 1288+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Aregex-0.12/test/getpagesize.h | 25+++++++++++++++++++++++++
Aregex-0.12/test/iregex.c | 164+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Aregex-0.12/test/main.c | 49+++++++++++++++++++++++++++++++++++++++++++++++++
Aregex-0.12/test/malloc-test.c | 47+++++++++++++++++++++++++++++++++++++++++++++++
Aregex-0.12/test/other.c | 503+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Aregex-0.12/test/printchar.c | 14++++++++++++++
Aregex-0.12/test/psx-basic.c | 253+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Aregex-0.12/test/psx-extend.c | 1244+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Aregex-0.12/test/psx-generic.c | 336+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Aregex-0.12/test/psx-group.c | 440+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Aregex-0.12/test/psx-interf.c | 624+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Aregex-0.12/test/psx-interv.c | 140+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Aregex-0.12/test/regexcpp.sed | 8++++++++
Aregex-0.12/test/syntax.skel | 74++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Aregex-0.12/test/test.c | 782+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Aregex-0.12/test/test.h | 141+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Aregex-0.12/test/tregress.c | 464+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Aregex-0.12/test/upcase.c | 39+++++++++++++++++++++++++++++++++++++++
Aregex-0.12/test/xmalloc.c | 21+++++++++++++++++++++
47 files changed, 31674 insertions(+), 0 deletions(-)

diff --git a/regex-0.12/AUTHORS b/regex-0.12/AUTHORS @@ -0,0 +1,10 @@ +Richard Stallman -- original version and continuing revisions of + regex.c and regex.h, and original version of the documentation. + +Karl Berry and Kathryn Hargreaves -- extensive modifications to above, + and all test files. + +Jim Blandy -- original version of re_set_registers, revisions to regex.c. + +Joe Arceneaux, David MacKenzie, Mike Haertel, Charles Hannum, and +probably others -- revisions to regex.c. diff --git a/regex-0.12/COPYING b/regex-0.12/COPYING @@ -0,0 +1,339 @@ + GNU GENERAL PUBLIC LICENSE + Version 2, June 1991 + + Copyright (C) 1989, 1991 Free Software Foundation, Inc. + 675 Mass Ave, Cambridge, MA 02139, USA + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + Preamble + + The licenses for most software are designed to take away your +freedom to share and change it. By contrast, the GNU General Public +License is intended to guarantee your freedom to share and change free +software--to make sure the software is free for all its users. This +General Public License applies to most of the Free Software +Foundation's software and to any other program whose authors commit to +using it. (Some other Free Software Foundation software is covered by +the GNU Library General Public License instead.) You can apply it to +your programs, too. + + When we speak of free software, we are referring to freedom, not +price. Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +this service if you wish), that you receive source code or can get it +if you want it, that you can change the software or use pieces of it +in new free programs; and that you know you can do these things. + + To protect your rights, we need to make restrictions that forbid +anyone to deny you these rights or to ask you to surrender the rights. 
+These restrictions translate to certain responsibilities for you if you +distribute copies of the software, or if you modify it. + + For example, if you distribute copies of such a program, whether +gratis or for a fee, you must give the recipients all the rights that +you have. You must make sure that they, too, receive or can get the +source code. And you must show them these terms so they know their +rights. + + We protect your rights with two steps: (1) copyright the software, and +(2) offer you this license which gives you legal permission to copy, +distribute and/or modify the software. + + Also, for each author's protection and ours, we want to make certain +that everyone understands that there is no warranty for this free +software. If the software is modified by someone else and passed on, we +want its recipients to know that what they have is not the original, so +that any problems introduced by others will not reflect on the original +authors' reputations. + + Finally, any free program is threatened constantly by software +patents. We wish to avoid the danger that redistributors of a free +program will individually obtain patent licenses, in effect making the +program proprietary. To prevent this, we have made it clear that any +patent must be licensed for everyone's free use or not licensed at all. + + The precise terms and conditions for copying, distribution and +modification follow. + + GNU GENERAL PUBLIC LICENSE + TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION + + 0. This License applies to any program or other work which contains +a notice placed by the copyright holder saying it may be distributed +under the terms of this General Public License. 
The "Program", below, +refers to any such program or work, and a "work based on the Program" +means either the Program or any derivative work under copyright law: +that is to say, a work containing the Program or a portion of it, +either verbatim or with modifications and/or translated into another +language. (Hereinafter, translation is included without limitation in +the term "modification".) Each licensee is addressed as "you". + +Activities other than copying, distribution and modification are not +covered by this License; they are outside its scope. The act of +running the Program is not restricted, and the output from the Program +is covered only if its contents constitute a work based on the +Program (independent of having been made by running the Program). +Whether that is true depends on what the Program does. + + 1. You may copy and distribute verbatim copies of the Program's +source code as you receive it, in any medium, provided that you +conspicuously and appropriately publish on each copy an appropriate +copyright notice and disclaimer of warranty; keep intact all the +notices that refer to this License and to the absence of any warranty; +and give any other recipients of the Program a copy of this License +along with the Program. + +You may charge a fee for the physical act of transferring a copy, and +you may at your option offer warranty protection in exchange for a fee. + + 2. You may modify your copy or copies of the Program or any portion +of it, thus forming a work based on the Program, and copy and +distribute such modifications or work under the terms of Section 1 +above, provided that you also meet all of these conditions: + + a) You must cause the modified files to carry prominent notices + stating that you changed the files and the date of any change. 
+ + b) You must cause any work that you distribute or publish, that in + whole or in part contains or is derived from the Program or any + part thereof, to be licensed as a whole at no charge to all third + parties under the terms of this License. + + c) If the modified program normally reads commands interactively + when run, you must cause it, when started running for such + interactive use in the most ordinary way, to print or display an + announcement including an appropriate copyright notice and a + notice that there is no warranty (or else, saying that you provide + a warranty) and that users may redistribute the program under + these conditions, and telling the user how to view a copy of this + License. (Exception: if the Program itself is interactive but + does not normally print such an announcement, your work based on + the Program is not required to print an announcement.) + +These requirements apply to the modified work as a whole. If +identifiable sections of that work are not derived from the Program, +and can be reasonably considered independent and separate works in +themselves, then this License, and its terms, do not apply to those +sections when you distribute them as separate works. But when you +distribute the same sections as part of a whole which is a work based +on the Program, the distribution of the whole must be on the terms of +this License, whose permissions for other licensees extend to the +entire whole, and thus to each and every part regardless of who wrote it. + +Thus, it is not the intent of this section to claim rights or contest +your rights to work written entirely by you; rather, the intent is to +exercise the right to control the distribution of derivative or +collective works based on the Program. 
+ +In addition, mere aggregation of another work not based on the Program +with the Program (or with a work based on the Program) on a volume of +a storage or distribution medium does not bring the other work under +the scope of this License. + + 3. You may copy and distribute the Program (or a work based on it, +under Section 2) in object code or executable form under the terms of +Sections 1 and 2 above provided that you also do one of the following: + + a) Accompany it with the complete corresponding machine-readable + source code, which must be distributed under the terms of Sections + 1 and 2 above on a medium customarily used for software interchange; or, + + b) Accompany it with a written offer, valid for at least three + years, to give any third party, for a charge no more than your + cost of physically performing source distribution, a complete + machine-readable copy of the corresponding source code, to be + distributed under the terms of Sections 1 and 2 above on a medium + customarily used for software interchange; or, + + c) Accompany it with the information you received as to the offer + to distribute corresponding source code. (This alternative is + allowed only for noncommercial distribution and only if you + received the program in object code or executable form with such + an offer, in accord with Subsection b above.) + +The source code for a work means the preferred form of the work for +making modifications to it. For an executable work, complete source +code means all the source code for all modules it contains, plus any +associated interface definition files, plus the scripts used to +control compilation and installation of the executable. 
However, as a +special exception, the source code distributed need not include +anything that is normally distributed (in either source or binary +form) with the major components (compiler, kernel, and so on) of the +operating system on which the executable runs, unless that component +itself accompanies the executable. + +If distribution of executable or object code is made by offering +access to copy from a designated place, then offering equivalent +access to copy the source code from the same place counts as +distribution of the source code, even though third parties are not +compelled to copy the source along with the object code. + + 4. You may not copy, modify, sublicense, or distribute the Program +except as expressly provided under this License. Any attempt +otherwise to copy, modify, sublicense or distribute the Program is +void, and will automatically terminate your rights under this License. +However, parties who have received copies, or rights, from you under +this License will not have their licenses terminated so long as such +parties remain in full compliance. + + 5. You are not required to accept this License, since you have not +signed it. However, nothing else grants you permission to modify or +distribute the Program or its derivative works. These actions are +prohibited by law if you do not accept this License. Therefore, by +modifying or distributing the Program (or any work based on the +Program), you indicate your acceptance of this License to do so, and +all its terms and conditions for copying, distributing or modifying +the Program or works based on it. + + 6. Each time you redistribute the Program (or any work based on the +Program), the recipient automatically receives a license from the +original licensor to copy, distribute or modify the Program subject to +these terms and conditions. You may not impose any further +restrictions on the recipients' exercise of the rights granted herein. 
+You are not responsible for enforcing compliance by third parties to +this License. + + 7. If, as a consequence of a court judgment or allegation of patent +infringement or for any other reason (not limited to patent issues), +conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot +distribute so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you +may not distribute the Program at all. For example, if a patent +license would not permit royalty-free redistribution of the Program by +all those who receive copies directly or indirectly through you, then +the only way you could satisfy both it and this License would be to +refrain entirely from distribution of the Program. + +If any portion of this section is held invalid or unenforceable under +any particular circumstance, the balance of the section is intended to +apply and the section as a whole is intended to apply in other +circumstances. + +It is not the purpose of this section to induce you to infringe any +patents or other property right claims or to contest validity of any +such claims; this section has the sole purpose of protecting the +integrity of the free software distribution system, which is +implemented by public license practices. Many people have made +generous contributions to the wide range of software distributed +through that system in reliance on consistent application of that +system; it is up to the author/donor to decide if he or she is willing +to distribute software through any other system and a licensee cannot +impose that choice. + +This section is intended to make thoroughly clear what is believed to +be a consequence of the rest of this License. + + 8. 
If the distribution and/or use of the Program is restricted in +certain countries either by patents or by copyrighted interfaces, the +original copyright holder who places the Program under this License +may add an explicit geographical distribution limitation excluding +those countries, so that distribution is permitted only in or among +countries not thus excluded. In such case, this License incorporates +the limitation as if written in the body of this License. + + 9. The Free Software Foundation may publish revised and/or new versions +of the General Public License from time to time. Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + +Each version is given a distinguishing version number. If the Program +specifies a version number of this License which applies to it and "any +later version", you have the option of following the terms and conditions +either of that version or of any later version published by the Free +Software Foundation. If the Program does not specify a version number of +this License, you may choose any version ever published by the Free Software +Foundation. + + 10. If you wish to incorporate parts of the Program into other free +programs whose distribution conditions are different, write to the author +to ask for permission. For software which is copyrighted by the Free +Software Foundation, write to the Free Software Foundation; we sometimes +make exceptions for this. Our decision will be guided by the two goals +of preserving the free status of all derivatives of our free software and +of promoting the sharing and reuse of software generally. + + NO WARRANTY + + 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY +FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. 
EXCEPT WHEN +OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES +PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED +OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS +TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE +PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, +REPAIR OR CORRECTION. + + 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR +REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, +INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING +OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED +TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY +YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER +PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE +POSSIBILITY OF SUCH DAMAGES. + + END OF TERMS AND CONDITIONS + + Appendix: How to Apply These Terms to Your New Programs + + If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these terms. + + To do so, attach the following notices to the program. It is safest +to attach them to the start of each source file to most effectively +convey the exclusion of warranty; and each file should have at least +the "copyright" line and a pointer to where the full notice is found. 
+ + <one line to give the program's name and a brief idea of what it does.> + Copyright (C) 19yy <name of author> + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + +Also add information on how to contact you by electronic and paper mail. + +If the program is interactive, make it output a short notice like this +when it starts in an interactive mode: + + Gnomovision version 69, Copyright (C) 19yy name of author + Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. + This is free software, and you are welcome to redistribute it + under certain conditions; type `show c' for details. + +The hypothetical commands `show w' and `show c' should show the appropriate +parts of the General Public License. Of course, the commands you use may +be called something other than `show w' and `show c'; they could even be +mouse-clicks or menu items--whatever suits your program. + +You should also get your employer (if you work as a programmer) or your +school, if any, to sign a "copyright disclaimer" for the program, if +necessary. Here is a sample; alter the names: + + Yoyodyne, Inc., hereby disclaims all copyright interest in the program + `Gnomovision' (which makes passes at compilers) written by James Hacker. 
+ + <signature of Ty Coon>, 1 April 1989 + Ty Coon, President of Vice + +This General Public License does not permit incorporating your program into +proprietary programs. If your program is a subroutine library, you may +consider it more useful to permit linking proprietary applications with the +library. If this is what you want to do, use the GNU Library General +Public License instead of this License. diff --git a/regex-0.12/ChangeLog b/regex-0.12/ChangeLog @@ -0,0 +1,3030 @@ +Fri Apr 2 17:31:59 1993 Jim Blandy (jimb@totoro.cs.oberlin.edu) + + * Released version 0.12. + + * regex.c (regerror): If errcode is zero, that's not a valid + error code, according to POSIX, but return "Success." + + * regex.c (regerror): Remember to actually fetch the message + from re_error_msg. + + * regex.c (regex_compile): Don't use the trick for ".*\n" on + ".+\n". Since the latter involves laying an extra choice + point, the backward jump isn't adjusted properly. + +Thu Mar 25 21:35:18 1993 Jim Blandy (jimb@totoro.cs.oberlin.edu) + + * regex.c (regex_compile): In the handle_open and handle_close + sections, clear pending_exact to zero. + +Tue Mar 9 12:03:07 1993 Jim Blandy (jimb@wookumz.gnu.ai.mit.edu) + + * regex.c (re_search_2): In the loop which searches forward + using fastmap, don't forget to cast the character from the + string to an unsigned before using it as an index into the + translate map. + +Thu Jan 14 15:41:46 1993 David J. MacKenzie (djm@kropotkin.gnu.ai.mit.edu) + + * regex.h: Never define const; let the callers do it. + configure.in: Don't define USING_AUTOCONF. + +Wed Jan 6 20:49:29 1993 Jim Blandy (jimb@geech.gnu.ai.mit.edu) + + * regex.c (regerror): Abort if ERRCODE is out of range. + +Sun Dec 20 16:19:10 1992 Jim Blandy (jimb@totoro.cs.oberlin.edu) + + * configure.in: Arrange to #define USING_AUTOCONF. + * regex.h: If USING_AUTOCONF is #defined, don't mess with + `const' at all; autoconf has taken care of it. + +Mon Dec 14 21:40:39 1992 David J. 
MacKenzie (djm@kropotkin.gnu.ai.mit.edu) + + * regex.h (RE_SYNTAX_AWK): Fix typo. From Arnold Robbins. + +Sun Dec 13 20:35:39 1992 Jim Blandy (jimb@totoro.cs.oberlin.edu) + + * regex.c (compile_range): Fetch the range start and end by + casting the pattern pointer to an `unsigned char *' before + fetching through it. + +Sat Dec 12 09:41:01 1992 Jim Blandy (jimb@totoro.cs.oberlin.edu) + + * regex.c: Undo change of 12/7/92; it's better for Emacs to + #define HAVE_CONFIG_H. + +Fri Dec 11 22:00:34 1992 Jim Meyering (meyering@hal.gnu.ai.mit.edu) + + * regex.c: Define and use isascii-protected ctype.h macros. + +Fri Dec 11 05:10:38 1992 Jim Blandy (jimb@totoro.cs.oberlin.edu) + + * regex.c (re_match_2): Undo Karl's November 10th change; it + keeps the group in :\(.*\) from matching :/ properly. + +Mon Dec 7 19:44:56 1992 Jim Blandy (jimb@wookumz.gnu.ai.mit.edu) + + * regex.c: #include config.h if either HAVE_CONFIG_H or emacs + is #defined. + +Tue Dec 1 13:33:17 1992 David J. MacKenzie (djm@goldman.gnu.ai.mit.edu) + + * regex.c [HAVE_CONFIG_H]: Include config.h. + +Wed Nov 25 23:46:02 1992 David J. MacKenzie (djm@goldman.gnu.ai.mit.edu) + + * regex.c (regcomp): Add parens around bitwise & for clarity. + Initialize preg->allocated to prevent segv. + +Tue Nov 24 09:22:29 1992 David J. MacKenzie (djm@goldman.gnu.ai.mit.edu) + + * regex.c: Use HAVE_STRING_H, not USG. + * configure.in: Check for string.h, not USG. + +Fri Nov 20 06:33:24 1992 Karl Berry (karl@cs.umb.edu) + + * regex.c (SIGN_EXTEND_CHAR) [VMS]: Back out of this change, + since Roland Roberts now says it was a localism. + +Mon Nov 16 07:01:36 1992 Karl Berry (karl@cs.umb.edu) + + * regex.h (const) [!HAVE_CONST]: Test another cpp symbol (from + Autoconf) before zapping const. + +Sun Nov 15 05:36:42 1992 Jim Blandy (jimb@wookumz.gnu.ai.mit.edu) + + * regex.c, regex.h: Changes for VMS from Roland B Roberts + <roberts@nsrl31.nsrl.rochester.edu>. 
+ +Thu Nov 12 11:31:15 1992 Karl Berry (karl@cs.umb.edu) + + * Makefile.in (distfiles): Include INSTALL. + +Tue Nov 10 09:29:23 1992 Karl Berry (karl@cs.umb.edu) + + * regex.c (re_match_2): At maybe_pop_jump, if at end of string + and pattern, just quit the matching loop. + + * regex.c (LETTER_P): Rename to `WORDCHAR_P'. + + * regex.c (AT_STRINGS_{BEG,END}): Take `d' as an arg; change + callers. + + * regex.c (re_match_2) [!emacs]: In wordchar and notwordchar + cases, advance d. + +Wed Nov 4 15:43:58 1992 Karl Berry (karl@hal.gnu.ai.mit.edu) + + * regex.h (const) [!__STDC__]: Don't define if it's already defined. + +Sat Oct 17 19:28:19 1992 Karl Berry (karl@cs.umb.edu) + + * regex.c (bcmp, bcopy, bzero): Only #define if they are not + already #defined. + + * configure.in: Use AC_CONST. + +Thu Oct 15 08:39:06 1992 Karl Berry (karl@cs.umb.edu) + + * regex.h (const) [!const]: Conditionalize. + +Fri Oct 2 13:31:42 1992 Karl Berry (karl@cs.umb.edu) + + * regex.h (RE_SYNTAX_ED): New definition. + +Sun Sep 20 12:53:39 1992 Karl Berry (karl@cs.umb.edu) + + * regex.[ch]: remove traces of `longest_p' -- dumb idea to put + this into the pattern buffer, as it means parallelism loses. + + * Makefile.in (config.status): use sh to run configure --no-create. + + * Makefile.in (realclean): OK, don't remove configure. + +Sat Sep 19 09:05:08 1992 Karl Berry (karl@hayley) + + * regex.c (PUSH_FAILURE_POINT, POP_FAILURE_POINT) [DEBUG]: keep + track of how many failure points we push and pop. + (re_match_2) [DEBUG]: declare variables for that, and print results. + (DEBUG_PRINT4): new macro. + + * regex.h (re_pattern_buffer): new field `longest_p' (to + eliminate backtracking if the user doesn't need it). + * regex.c (re_compile_pattern): initialize it (to 1). + (re_search_2): set it to zero if register information is not needed. + (re_match_2): if it's set, don't backtrack. + + * regex.c (re_search_2): update fastmap only after checking that + the pattern is anchored. 
+ + * regex.c (re_match_2): do more debugging at maybe_pop_jump. + + * regex.c (re_search_2): cast result of TRANSLATE for use in + array subscript. + +Thu Sep 17 19:47:16 1992 Karl Berry (karl@geech.gnu.ai.mit.edu) + + * Version 0.11. + +Wed Sep 16 08:17:10 1992 Karl Berry (karl@hayley) + + * regex.c (INIT_FAIL_STACK): rewrite as statements instead of a + complicated comma expr, to avoid compiler warnings (and also + simplify). + (re_compile_fastmap, re_match_2): change callers. + + * regex.c (POP_FAILURE_POINT): cast pop of regstart and regend + to avoid compiler warnings. + + * regex.h (RE_NEWLINE_ORDINARY): remove this syntax bit, and + remove uses. + * regex.c (at_{beg,end}line_loc_p): go the last mile: remove + the RE_NEWLINE_ORDINARY case which made the ^ in \n^ be an anchor. + +Tue Sep 15 09:55:29 1992 Karl Berry (karl@hayley) + + * regex.c (at_begline_loc_p): new fn. + (at_endline_loc_p): simplify at_endline_op_p. + (regex_compile): in ^/$ cases, call the above. + + * regex.c (POP_FAILURE_POINT): rewrite the fn as a macro again, + as lord's profiling indicates the function is 20% of the time. + (re_match_2): callers changed. + + * configure.in (AC_MEMORY_H): remove, since we never use memcpy et al. + +Mon Sep 14 17:49:27 1992 Karl Berry (karl@hayley) + + * Makefile.in (makeargs): include MFLAGS. + +Sun Sep 13 07:41:45 1992 Karl Berry (karl@hayley) + + * regex.c (regex_compile): in \1..\9 case, make it always + invalid to use \<digit> if there is no preceding <digit>th subexpr. + * regex.h (RE_NO_MISSING_BK_REF): remove this syntax bit. + + * regex.c (regex_compile): remove support for invalid empty groups. + * regex.h (RE_NO_EMPTY_GROUPS): remove this syntax bit. + + * regex.c (FREE_VARIABLES) [!REGEX_MALLOC]: define as alloca (0), + to reclaim memory. + + * regex.h (RE_SYNTAX_POSIX_SED): don't bother with this. + +Sat Sep 12 13:37:21 1992 Karl Berry (karl@hayley) + + * README: incorporate emacs.diff. 
+ + * regex.h (_RE_ARGS) [!__STDC__]: define as empty parens. + + * configure.in: add AC_ALLOCA. + + * Put test files in subdir test, documentation in subdir doc. + Adjust Makefile.in and configure.in accordingly. + +Thu Sep 10 10:29:11 1992 Karl Berry (karl@hayley) + + * regex.h (RE_SYNTAX_{POSIX_,}SED): new definitions. + +Wed Sep 9 06:27:09 1992 Karl Berry (karl@hayley) + + * Version 0.10. + +Tue Sep 8 07:32:30 1992 Karl Berry (karl@hayley) + + * xregex.texinfo: put the day of month into the date. + + * Makefile.in (realclean): remove Texinfo-generated files. + (distclean): remove empty sorted index files. + (clean): remove dvi files, etc. + + * configure.in: test for more Unix variants. + + * fileregex.c: new file. + Makefile.in (fileregex): new target. + + * iregex.c (main): move variable decls to smallest scope. + + * regex.c (FREE_VARIABLES): free reg_{,info_}dummy. + (re_match_2): check that the allocation for those two succeeded. + + * regex.c (FREE_VAR): replace FREE_NONNULL with this. + (FREE_VARIABLES): call it. + (re_match_2) [REGEX_MALLOC]: initialize all our vars to NULL. + + * tregress.c (do_match): generalize simple_match. + (SIMPLE_NONMATCH): new macro. + (SIMPLE_MATCH): change from routine. + + * Makefile.in (regex.texinfo): make file readonly, so we don't + edit it by mistake. + + * many files (re_default_syntax): rename to `re_syntax_options'; + call re_set_syntax instead of assigning to the variable where + possible. + +Mon Sep 7 10:12:16 1992 Karl Berry (karl@hayley) + + * syntax.skel: don't use prototypes. + + * {configure,Makefile}.in: new files. + + * regex.c: include <string.h> `#if USG || STDC_HEADERS'; remove + obsolete test for `POSIX', and test for BSTRING. + Include <strings.h> if we are not USG or STDC_HEADERS. + Do not include <unistd.h>. What did we ever need that for? + + * regex.h (RE_NO_EMPTY_ALTS): remove this. + (RE_SYNTAX_AWK): remove from here, too. + * regex.c (regex_compile): remove the check. 
+ * xregex.texinfo (Alternation Operator): update. + * other.c (test_others): remove tests for this. + + * regex.h (RE_DUP_MAX): undefine if already defined. + + * regex.h: (RE_SYNTAX_POSIX*): redo to allow more operators, and + define new syntaxes with the minimal set. + + * syntax.skel (main): used sscanf instead of scanf. + + * regex.h (RE_SYNTAX_*GREP): new definitions from mike. + + * regex.c (regex_compile): initialize the upper bound of + intervals at the beginning of the interval, not the end. + (From pclink@qld.tne.oz.au.) + + * regex.c (handle_bar): rename to `handle_alt', for consistency. + + * regex.c ({store,insert}_{op1,op2}): new routines (except the last). + ({STORE,INSERT}_JUMP{,2}): macros to replace the old routines, + which took arguments in different orders, and were generally weird. + + * regex.c (PAT_PUSH*): rename to `BUF_PUSH*' -- we're not + appending info to the pattern! + +Sun Sep 6 11:26:49 1992 Karl Berry (karl@hayley) + + * regex.c (regex_compile): delete the variable + `following_left_brace', since we never use it. + + * regex.c (print_compiled_pattern): don't print the fastmap if + it's null. + + * regex.c (re_compile_fastmap): handle + `on_failure_keep_string_jump' like `on_failure_jump'. + + * regex.c (re_match_2): in `charset{,_not' case, cast the bit + count to unsigned, not unsigned char, in case we have a full + 32-byte bit list. + + * tregress.c (simple_match): remove. + (simple_test): rename as `simple_match'. + (simple_compile): print the error string if the compile failed. + + * regex.c (DO_RANGE): rewrite as a function, `compile_range', so + we can debug it. Change pattern characters to unsigned char + *'s, and change the range variable to an unsigned. + (regex_compile): change calls. + +Sat Sep 5 17:40:49 1992 Karl Berry (karl@hayley) + + * regex.h (_RE_ARGS): new macro to put in argument lists (if + ANSI) or omit them (if K&R); don't declare routines twice. 
+ + * many files (obscure_syntax): rename to `re_default_syntax'. + +Fri Sep 4 09:06:53 1992 Karl Berry (karl@hayley) + + * GNUmakefile (extraclean): new target. + (realclean): delete the info files. + +Wed Sep 2 08:14:42 1992 Karl Berry (karl@hayley) + + * regex.h: doc fix. + +Sun Aug 23 06:53:15 1992 Karl Berry (karl@hayley) + + * regex.[ch] (re_comp): no const in the return type (from djm). + +Fri Aug 14 07:25:46 1992 Karl Berry (karl@hayley) + + * regex.c (DO_RANGE): declare variables as unsigned chars, not + signed chars (from jimb). + +Wed Jul 29 18:33:53 1992 Karl Berry (karl@claude.cs.umb.edu) + + * Version 0.9. + + * GNUmakefile (distclean): do not remove regex.texinfo. + (realclean): remove it here. + + * tregress.c (simple_test): initialize buf.buffer. + +Sun Jul 26 08:59:38 1992 Karl Berry (karl@hayley) + + * regex.c (push_dummy_failure): new opcode and corresponding + case in the various routines. Pushed at the end of + alternatives. + + * regex.c (jump_past_next_alt): rename to `jump_past_alt', for + brevity. + (no_pop_jump): rename to `jump'. + + * regex.c (regex_compile) [DEBUG]: terminate printing of pattern + with a newline. + + * NEWS: new file. + + * tregress.c (simple_{compile,match,test}): routines to simplify all + these little tests. + + * tregress.c: test for matching as much as possible. + +Fri Jul 10 06:53:32 1992 Karl Berry (karl@hayley) + + * Version 0.8. + +Wed Jul 8 06:39:31 1992 Karl Berry (karl@hayley) + + * regex.c (SIGN_EXTEND_CHAR): #undef any previous definition, as + ours should always work properly. + +Mon Jul 6 07:10:50 1992 Karl Berry (karl@hayley) + + * iregex.c (main) [DEBUG]: conditionalize the call to + print_compiled_pattern. + + * iregex.c (main): initialize buf.buffer to NULL. + * tregress (test_regress): likewise. + + * regex.c (alloca) [sparc]: #if on HAVE_ALLOCA_H instead. + + * tregress.c (test_regress): didn't have jla's test quite right. 
+ +Sat Jul 4 09:02:12 1992 Karl Berry (karl@hayley) + + * regex.c (re_match_2): only REGEX_ALLOCATE all the register + vectors if the pattern actually has registers. + (match_end): new variable to avoid having to use best_regend[0]. + + * regex.c (IS_IN_FIRST_STRING): rename to FIRST_STRING_P. + + * regex.c: doc fixes. + + * tregress.c (test_regress): new fastmap test forwarded by rms. + + * tregress.c (test_regress): initialize the fastmap field. + + * tregress.c (test_regress): new test from jla that aborted + in re_search_2. + +Fri Jul 3 09:10:05 1992 Karl Berry (karl@hayley) + + * tregress.c (test_regress): add tests for translating charsets, + from kaoru. + + * GNUmakefile (common): add alloca.o. + * alloca.c: new file, copied from bison. + + * other.c (test_others): remove var `buf', since it's no longer used. + + * Below changes from ro@TechFak.Uni-Bielefeld.DE. + + * tregress.c (test_regress): initialize buf.allocated. + + * regex.c (re_compile_fastmap): initialize `succeed_n_p'. + + * GNUmakefile (regex): depend on $(common). + +Wed Jul 1 07:12:46 1992 Karl Berry (karl@hayley) + + * Version 0.7. + + * regex.c: doc fixes. + +Mon Jun 29 08:09:47 1992 Karl Berry (karl@fosse) + + * regex.c (pop_failure_point): change string vars to + `const char *' from `unsigned char *'. + + * regex.c: consolidate debugging stuff. + (print_partial_compiled_pattern): avoid enum clash. + +Mon Jun 29 07:50:27 1992 Karl Berry (karl@hayley) + + * xmalloc.c: new file. + * GNUmakefile (common): add it. + + * iregex.c (print_regs): new routine (from jimb). + (main): call it. + +Sat Jun 27 10:50:59 1992 Jim Blandy (jimb@pogo.cs.oberlin.edu) + + * xregex.c (re_match_2): When we have accepted a match and + restored d from best_regend[0], we need to set dend + appropriately as well. + +Sun Jun 28 08:48:41 1992 Karl Berry (karl@hayley) + + * tregress.c: rename from regress.c. + + * regex.c (print_compiled_pattern): improve charset case to ease + byte-counting. 
+ Also, don't distinguish between Emacs and non-Emacs + {not,}wordchar opcodes. + + * regex.c (print_fastmap): move here. + * test.c: from here. + * regex.c (print_{{partial,}compiled_pattern,double_string}): + rename from ..._printer. Change calls here and in test.c. + + * regex.c: create from xregex.c and regexinc.c for once and for + all, and change the debug fns to be extern, instead of static. + * GNUmakefile: remove traces of xregex.c. + * test.c: put in externs, instead of including regexinc.c. + + * xregex.c: move interactive main program and scanstring to iregex.c. + * iregex.c: new file. + * upcase.c, printchar.c: new files. + + * various doc fixes and other cosmetic changes throughout. + + * regexinc.c (compiled_pattern_printer): change variable name, + for consistency. + (partial_compiled_pattern_printer): print other info about the + compiled pattern, besides just the opcodes. + * xregex.c (regex_compile) [DEBUG]: print the compiled pattern + when we're done. + + * xregex.c (re_compile_fastmap): in the duplicate case, set + `can_be_null' and return. + Also, set `bufp->can_be_null' according to a new variable, + `path_can_be_null'. + Also, rewrite main while loop to not test `p != NULL', since + we never set it that way. + Also, eliminate special `can_be_null' value for the endline case. + (re_search_2): don't test for the special value. + * regex.h (struct re_pattern_buffer): remove the definition. + +Sat Jun 27 15:00:40 1992 Karl Berry (karl@hayley) + + * xregex.c (re_compile_fastmap): remove the `RE_' from + `REG_RE_MATCH_NULL_AT_END'. + Also, assert the fastmap in the pattern buffer is non-null. + Also, reset `succeed_n_p' after we've + paid attention to it, instead of every time through the loop. + Also, in the `anychar' case, only clear fastmap['\n'] if the + syntax says to, and don't return prematurely. + Also, rearrange cases in some semblance of a rational order. + * regex.h (REG_RE_MATCH_NULL_AT_END): remove the `RE_' from the name. 
+ + * other.c: take bug reports from here. + * regress.c: new file for them. + * GNUmakefile (test): add it. + * main.c (main): new possible test. + * test.h (test_type): new value in enum. + +Thu Jun 25 17:37:43 1992 Karl Berry (karl@hayley) + + * xregex.c (scanstring) [test]: new function from jimb to allow some + escapes. + (main) [test]: call it (on the string, not the pattern). + + * xregex.c (main): make return type `int'. + +Wed Jun 24 10:43:03 1992 Karl Berry (karl@hayley) + + * xregex.c (pattern_offset_t): change to `int', for the benefit + of patterns which compile to more than 2^15 bytes. + + * xregex.c (GET_BUFFER_SPACE): remove spurious braces. + + * xregex.texinfo (Using Registers): put in a stub to ``document'' + the new function. + * regex.h (re_set_registers) [!__STDC__]: declare. + * xregex.c (re_set_registers): declare K&R style (also move to a + different place in the file). + +Mon Jun 8 18:03:28 1992 Jim Blandy (jimb@pogo.cs.oberlin.edu) + + * regex.h (RE_NREGS): Doc fix. + + * xregex.c (re_set_registers): New function. + * regex.h (re_set_registers): Declaration for new function. + +Fri Jun 5 06:55:18 1992 Karl Berry (karl@hayley) + + * main.c (main): `return 0' instead of `exit (0)'. (From Paul Eggert) + + * regexinc.c (SIGN_EXTEND_CHAR): cast to unsigned char. + (extract_number, EXTRACT_NUMBER): don't bother to cast here. + +Tue Jun 2 07:37:53 1992 Karl Berry (karl@hayley) + + * Version 0.6. + + * Change copyrights to `1985, 89, ...'. + + * regex.h (REG_RE_MATCH_NULL_AT_END): new macro. + * xregex.c (re_compile_fastmap): initialize `can_be_null' to + `p==pend', instead of in the test at the top of the loop (as + it was, it was always being set). + Also, set `can_be_null'=1 if we would jump to the end of the + pattern in the `on_failure_jump' cases. + (re_search_2): check if `can_be_null' is 1, not nonzero. This + was the original test in rms' regex; why did we change this? 
+ + * xregex.c (re_compile_fastmap): rename `is_a_succeed_n' to + `succeed_n_p'. + +Sat May 30 08:09:08 1992 Karl Berry (karl@hayley) + + * xregex.c (re_compile_pattern): declare `regnum' as `unsigned', + not `regnum_t', for the benefit of those patterns with more + than 255 groups. + + * xregex.c: rename `failure_stack' to `fail_stack', for brevity; + likewise for `match_nothing' to `match_null'. + + * regexinc.c (REGEX_REALLOCATE): take both the new and old + sizes, and copy only the old bytes. + * xregex.c (DOUBLE_FAILURE_STACK): pass both old and new. + * This change from Thorsten Ohl. + +Fri May 29 11:45:22 1992 Karl Berry (karl@hayley) + + * regexinc.c (SIGN_EXTEND_CHAR): define as `(signed char) c' + instead of relying on __CHAR_UNSIGNED__, to work with + compilers other than GCC. From Per Bothner. + + * main.c (main): change return type to `int'. + +Mon May 18 06:37:08 1992 Karl Berry (karl@hayley) + + * regex.h (RE_SYNTAX_AWK): typo in RE_RE_UNMATCHED... + +Fri May 15 10:44:46 1992 Karl Berry (karl@hayley) + + * Version 0.5. + +Sun May 3 13:54:00 1992 Karl Berry (karl@hayley) + + * regex.h (struct re_pattern_buffer): now it's just `regs_allocated'. + (REGS_UNALLOCATED, REGS_REALLOCATE, REGS_FIXED): new constants. + * xregex.c (regexec, re_compile_pattern): set the field appropriately. + (re_match_2): and use it. bufp can't be const any more. + +Fri May 1 15:43:09 1992 Karl Berry (karl@hayley) + + * regexinc.c: unconditionally include <sys/types.h>, first. + + * regex.h (struct re_pattern_buffer): rename + `caller_allocated_regs' to `regs_allocated_p'. + * xregex.c (re_compile_pattern): same change here. + (regexec): and here. + (re_match_2): reallocate registers if necessary. + +Fri Apr 10 07:46:50 1992 Karl Berry (karl@hayley) + + * regex.h (RE_SYNTAX{_POSIX,}_AWK): new definitions from Arnold. + +Sun Mar 15 07:34:30 1992 Karl Berry (karl at hayley) + + * GNUmakefile (dist): versionize regex.{c,h,texinfo}. 
+ +Tue Mar 10 07:05:38 1992 Karl Berry (karl at hayley) + + * Version 0.4. + + * xregex.c (PUSH_FAILURE_POINT): always increment the failure id. + (DEBUG_STATEMENT) [DEBUG]: execute the statement even if `debug'==0. + + * xregex.c (pop_failure_point): if the saved string location is + null, keep the current value. + (re_match_2): at fail, test for a dummy failure point by + checking the restored pattern value, not string value. + (re_match_2): new case, `on_failure_keep_string_jump'. + (regex_compile): output this opcode in the .*\n case. + * regexinc.c (re_opcode_t): define the opcode. + (partial_compiled_pattern_printer): add the new case. + +Mon Mar 9 09:09:27 1992 Karl Berry (karl at hayley) + + * xregex.c (regex_compile): optimize .*\n to output an + unconditional jump to the ., instead of pushing failure points + each time through the loop. + + * xregex.c (DOUBLE_FAILURE_STACK): compute the maximum size + ourselves (and correctly); change callers. + +Sun Mar 8 17:07:46 1992 Karl Berry (karl at hayley) + + * xregex.c (failure_stack_elt_t): change to `const char *', to + avoid warnings. + + * regex.h (re_set_syntax): declare this. + + * xregex.c (pop_failure_point) [DEBUG]: conditionally pass the + original strings and sizes; change callers. + +Thu Mar 5 16:35:35 1992 Karl Berry (karl at claude.cs.umb.edu) + + * xregex.c (regnum_t): new type for register/group numbers. + (compile_stack_elt_t, regex_compile): use it. + + * xregex.c (regexec): declare len as `int' to match re_search. + + * xregex.c (re_match_2): don't declare p1 twice. + + * xregex.c: change `while (1)' to `for (;;)' to avoid silly + compiler warnings. + + * regex.h [__STDC__]: use #if, not #ifdef. + + * regexinc.c (REGEX_REALLOCATE): cast the result of alloca to + (char *), to avoid warnings. + + * xregex.c (regerror): declare variable as const. + + * xregex.c (re_compile_pattern, re_comp): define as returning a const + char *. + * regex.h (re_compile_pattern, re_comp): likewise. 
+ +Thu Mar 5 15:57:56 1992 Karl Berry (karl@hal) + + * xregex.c (regcomp): declare `syntax' as unsigned. + + * xregex.c (re_match_2): try to avoid compiler warnings about + unsigned comparisons. + + * GNUmakefile (test-xlc): new target. + + * regex.h (reg_errcode_t): remove trailing comma from definition. + * regexinc.c (re_opcode_t): likewise. + +Thu Mar 5 06:56:07 1992 Karl Berry (karl at hayley) + + * GNUmakefile (dist): add version numbers automatically. + (versionfiles): new variable. + (regex.{c,texinfo}): don't add version numbers here. + * regex.h: put in placeholder instead of the version number. + +Fri Feb 28 07:11:33 1992 Karl Berry (karl at hayley) + + * xregex.c (re_error_msg): declare const, since it is. + +Sun Feb 23 05:41:57 1992 Karl Berry (karl at fosse) + + * xregex.c (PAT_PUSH{,_2,_3}, ...): cast args to avoid warnings. + (regex_compile, regexec): return REG_NOERROR, instead + of 0, on success. + (boolean): define as char, and #define false and true. + * regexinc.c (STREQ): cast the result. + +Sun Feb 23 07:45:38 1992 Karl Berry (karl at hayley) + + * GNUmakefile (test-cc, test-hc, test-pcc): new targets. + + * regex.inc (extract_number, extract_number_and_incr) [DEBUG]: + only define if we are debugging. + + * xregex.c [_AIX]: do #pragma alloca first if necessary. + * regexinc.c [_AIX]: remove the #pragma from here. + + * regex.h (reg_syntax_t): declare as unsigned, and redo the enum + as #define's again. Some compilers do stupid things with enums. + +Thu Feb 20 07:19:47 1992 Karl Berry (karl at hayley) + + * Version 0.3. + + * xregex.c, regex.h (newline_anchor_match_p): rename to + `newline_anchor'; dumb idea to change the name. + +Tue Feb 18 07:09:02 1992 Karl Berry (karl at hayley) + + * regexinc.c: go back to original, i.e., don't include + <string.h> or define strchr. + * xregex.c (regexec): don't bother with adding characters after + newlines to the fastmap; instead, just don't use a fastmap. 
+ * xregex.c (regcomp): set the buffer and fastmap fields to zero. + + * xregex.texinfo (GNU r.e. compiling): have to initialize more + than two fields. + + * regex.h (struct re_pattern_buffer): rename `newline_anchor' to + `newline_anchor_match_p', as we're back to two cases. + * xregex.c (regcomp, re_compile_pattern, re_comp): change + accordingly. + (re_match_2): at begline and endline, POSIX is not a special + case anymore; just check newline_anchor_match_p. + +Thu Feb 13 16:29:33 1992 Karl Berry (karl at hayley) + + * xregex.c (*empty_string*): rename to *null_string*, for brevity. + +Wed Feb 12 06:36:22 1992 Karl Berry (karl at hayley) + + * xregex.c (re_compile_fastmap): at endline, don't set fastmap['\n']. + (re_match_2): rewrite the begline/endline cases to take account + of the new field newline_anchor. + +Tue Feb 11 14:34:55 1992 Karl Berry (karl at hayley) + + * regexinc.c [!USG etc.]: include <strings.h> and define strchr + as index. + + * xregex.c (re_search_2): when searching backwards, declare `c' + as a char and use casts when using it as an array subscript. + + * xregex.c (regcomp): if REG_NEWLINE, set + RE_HAT_LISTS_NOT_NEWLINE. Set the `newline_anchor' field + appropriately. + (regex_compile): compile [^...] as matching a \n according to + the syntax bit. + (regexec): if doing REG_NEWLINE stuff, compile a fastmap and add + characters after any \n's to the newline. + * regex.h (RE_HAT_LISTS_NOT_NEWLINE): new syntax bit. + (struct re_pattern_buffer): rename `posix_newline' to + `newline_anchor', define constants for its values. + +Mon Feb 10 07:22:50 1992 Karl Berry (karl at hayley) + + * xregex.c (re_compile_fastmap): combine the code at the top and + bottom of the loop, as it's essentially identical. + +Sun Feb 9 10:02:19 1992 Karl Berry (karl at hayley) + + * xregex.texinfo (POSIX Translate Tables): remove this, as it + doesn't match the spec. 
+ + * xregex.c (re_compile_fastmap): if we finish off a path, go + back to the top (to set can_be_null) instead of returning + immediately. + + * xregex.texinfo: changes from bob. + +Sat Feb 1 07:03:25 1992 Karl Berry (karl at hayley) + + * xregex.c (re_search_2): doc fix (from rms). + +Fri Jan 31 09:52:04 1992 Karl Berry (karl at hayley) + + * xregex.texinfo (GNU Searching): clarify the range arg. + + * xregex.c (re_match_2, at_endline_op_p): add extra parens to + get rid of GCC 2's (silly, IMHO) warning about && within ||. + + * xregex.c (common_op_match_empty_string_p): use + MATCH_NOTHING_UNSET_VALUE, not -1. + +Thu Jan 16 08:43:02 1992 Karl Berry (karl at hayley) + + * xregex.c (SET_REGS_MATCHED): only set the registers from + lowest to highest. + + * regexinc.c (MIN): new macro. + * xregex.c (re_match_2): only check min (num_regs, + regs->num_regs) when we set the returned regs. + + * xregex.c (re_match_2): set registers after the first + num_regs to -1 before we return. + +Tue Jan 14 16:01:42 1992 Karl Berry (karl at hayley) + + * xregex.c (re_match_2): initialize max (RE_NREGS, re_nsub + 1) + registers (from rms). + + * xregex.c, regex.h: don't abbreviate `19xx' to `xx'. + + * regexinc.c [!emacs]: include <sys/types.h> before <unistd.h>. + (from ro@thp.Uni-Koeln.DE). + +Thu Jan 9 07:23:00 1992 Karl Berry (karl at hayley) + + * xregex.c (*unmatchable): rename to `match_empty_string_p'. + (CAN_MATCH_NOTHING): rename to `REG_MATCH_EMPTY_STRING_P'. + + * regexinc.c (malloc, realloc): remove prototypes, as they can + cause clashes (from rms). + +Mon Jan 6 12:43:24 1992 Karl Berry (karl at claude.cs.umb.edu) + + * Version 0.2. + +Sun Jan 5 10:50:38 1992 Karl Berry (karl at hayley) + + * xregex.texinfo: bring more or less up-to-date. + * GNUmakefile (regex.texinfo): generate from regex.h and + xregex.texinfo. + * include.awk: new file. + + * xregex.c: change all calls to the fn extract_number_and_incr + to the macro. 
+ + * xregex.c (re_match_2) [emacs]: in at_dot, use PTR_CHAR_POS + 1, + instead of bf_* and sl_*. Cast d to unsigned char *, to match + the declaration in Emacs' buffer.h. + [emacs19]: in before_dot, at_dot, and after_dot, likewise. + + * regexinc.c: unconditionally include <sys/types.h>. + + * regexinc.c (alloca) [!alloca]: Emacs config files sometimes + define this, so don't define it if it's already defined. + +Sun Jan 5 06:06:53 1992 Karl Berry (karl at fosse) + + * xregex.c (re_comp): fix type conflicts with regex_compile (we + haven't been compiling this). + + * regexinc.c (SIGN_EXTEND_CHAR): use `__CHAR_UNSIGNED__', not + `CHAR_UNSIGNED'. + + * regexinc.c (NULL) [!NULL]: define it (as zero). + + * regexinc.c (extract_number): remove the temporaries. + +Sun Jan 5 07:50:14 1992 Karl Berry (karl at hayley) + + * regex.h (regerror) [!__STDC__]: return a size_t, not a size_t *. + + * xregex.c (PUSH_FAILURE_POINT, ...): declare `destination' as + `char *' instead of `void *', to match alloca declaration. + + * xregex.c (regerror): use `size_t' for the intermediate values + as well as the return type. + + * xregex.c (regexec): cast the result of malloc. + + * xregex.c (regexec): don't initialize `private_preg' in the + declaration, as old C compilers can't do that. + + * xregex.c (main) [test]: declare printchar void. + + * xregex.c (assert) [!DEBUG]: define this to do nothing, and + remove #ifdef DEBUG's from around asserts. + + * xregex.c (re_match_2): remove error message when not debugging. + +Sat Jan 4 09:45:29 1992 Karl Berry (karl at hayley) + + * other.c: test the bizarre duplicate case in re_compile_fastmap + that I just noticed. + + * test.c (general_test): don't test registers beyond the end of + correct_regs, as well as regs. + + * xregex.c (regex_compile): at handle_close, don't assign to + *inner_group_loc if we didn't push a start_memory (because the + group number was too big). In fact, don't push or pop the + inner_group_offset in that case. 
+ + * regex.c: rename to xregex.c, since it's not the whole thing. + * regex.texinfo: likewise. + * GNUmakefile: change to match. + + * regex.c [DEBUG]: only include <stdio.h> if debugging. + + * regexinc.c (SIGN_EXTEND_CHAR) [CHAR_UNSIGNED]: if it's already + defined, don't redefine it. + + * regex.c: define _GNU_SOURCE at the beginning. + * regexinc.c (isblank) [!isblank]: define it. + (isgraph) [!isgraph]: change conditional to this, and remove the + sequent stuff. + + * regex.c (regex_compile): add `blank' character class. + + * regex.c (regex_compile): don't use a uchar variable to loop + through all characters. + + * regex.c (regex_compile): at '[', improve logic for checking + that we have enough space for the charset. + + * regex.h (struct re_pattern_buffer): declare translate as char + * again. We only use it as an array subscript once, I think. + + * regex.c (TRANSLATE): new macro to cast the data character + before subscripting. + (num_internal_regs): rename to `num_regs'. + +Fri Jan 3 07:58:01 1992 Karl Berry (karl at hayley) + + * regex.h (struct re_pattern_buffer): declare `allocated' and + `used' as unsigned long, since these are never negative. + + * regex.c (compile_stack_element): rename to compile_stack_elt_t. + (failure_stack_element): similarly. + + * regexinc.c (TALLOC, RETALLOC): new macros to simplify + allocation of arrays. + + * regex.h (re_*) [__STDC__]: don't declare string args unsigned + char *; that makes them incompatible with string constants. + (struct re_pattern_buffer): declare the pattern and translate + table as unsigned char *. + * regex.c (most routines): use unsigned char vs. char consistently. + + * regex.h (re_compile_pattern): do not declare the length arg as + const. + * regex.c (re_compile_pattern): likewise. + + * regex.c (POINTER_TO_REG): rename to `POINTER_TO_OFFSET'. + + * regex.h (re_registers): declare `start' and `end' as + `regoff_t', instead of `int'. 
+ + * regex.c (regexec): if either of the malloc's for the register + information fail, return failure. + + * regex.h (RE_NREGS): define this again, as 30 (from jla). + (RE_ALLOCATE_REGISTERS): remove this. + (RE_SYNTAX_*): remove it from definitions. + (re_pattern_buffer): remove `return_default_num_regs', add + `caller_allocated_regs'. + * regex.c (re_compile_pattern): clear no_sub and + caller_allocated_regs in the pattern. + (regcomp): set caller_allocated_regs. + (re_match_2): do all register allocation at the end of the + match; implement new semantics. + + * regex.c (MAX_REGNUM): new macro. + (regex_compile): at handle_open and handle_close, if the group + number is too large, don't push the start/stop memory. + +Thu Jan 2 07:56:10 1992 Karl Berry (karl at hayley) + + * regex.c (re_match_2): if the back reference is to a group that + never matched, then goto fail, not really_fail. Also, don't + test if the pattern can match the empty string. Why did we + ever do that? + (really_fail): this label no longer needed. + + * regexinc.c [STDC_HEADERS]: use only this to test if we should + include <stdlib.h>. + + * regex.c (DO_RANGE, regex_compile): translate in all cases + except the single character after a \. + + * regex.h (RE_AWK_CLASS_HACK): rename to + RE_BACKSLASH_ESCAPE_IN_LISTS. + * regex.c (regex_compile): change use. + + * regex.c (re_compile_fastmap): do not translate the characters + again; we already translated them at compilation. (From ylo@ngs.fi.) + + * regex.c (re_match_2): in case for at_dot, invert sense of + comparison and find the character number properly. (From + worley@compass.com.) + (re_match_2) [emacs]: remove the cases for before_dot and + after_dot, since there's no way to specify them, and the code + is wrong (judging from this change). 
+ +Wed Jan 1 09:13:38 1992 Karl Berry (karl at hayley) + + * psx-{interf,basic,extend}.c, other.c: set `t' as the first + thing, so that if we run them in succession, general_test's + kludge to see if we're doing POSIX tests works. + + * test.h (test_type): add `all_test'. + * main.c: add case for `all_test'. + + * regexinc.c (partial_compiled_pattern_printer, + double_string_printer): don't print anything if we're passed null. + + * regex.c (PUSH_FAILURE_POINT): do not scan for the highest and + lowest active registers. + (re_match_2): compute lowest/highest active regs at start_memory and + stop_memory. + (NO_{LOW,HIGH}EST_ACTIVE_REG): new sentinel values. + (pop_failure_point): return the lowest/highest active reg values + popped; change calls. + + * regex.c [DEBUG]: include <assert.h>. + (various routines) [DEBUG]: change conditionals to assertions. + + * regex.c (DEBUG_STATEMENT): new macro. + (PUSH_FAILURE_POINT): use it to increment num_regs_pushed. + (re_match_2) [DEBUG]: only declare num_regs_pushed if DEBUG. + + * regex.c (*can_match_nothing): rename to *unmatchable. + + * regex.c (re_match_2): at stop_memory, adjust argument reading. + + * regex.h (re_pattern_buffer): declare `can_be_null' as a 2-bit + bit field. + + * regex.h (re_pattern_buffer): declare `buffer' unsigned char *; + no, dumb idea. The pattern can have signed number. + + * regex.c (re_match_2): in maybe_pop_jump case, skip over the + right number of args to the group operators, and don't do + anything with endline if posix_newline is not set. + + * regex.c, regexinc.c (all the things we just changed): go back + to putting the inner group count after the start_memory, + because we need it in the on_failure_jump case in re_match_2. + But leave it after the stop_memory also, since we need it + there in re_match_2, and we don't have any way of getting back + to the start_memory. + + * regexinc.c (partial_compiled_pattern_printer): adjust argument + reading for start/stop_memory. 
+ * regex.c (re_compile_fastmap, group_can_match_nothing): likewise. + +Tue Dec 31 10:15:08 1991 Karl Berry (karl at hayley) + + * regex.c (bits list routines): remove these. + (re_match_2): get the number of inner groups from the pattern, + instead of keeping track of it at start and stop_memory. + Put the count after the stop_memory, not after the + start_memory. + (compile_stack_element): remove `fixup_inner_group' member, + since we now put it in when we can compute it. + (regex_compile): at handle_open, don't push the inner group + offset, and at handle_close, don't pop it. + + * regex.c (level routines): remove these, and their uses in + regex_compile. This was another manifestation of having to find + $'s that were endlines. + + * regex.c (regexec): this does searching, not matching (a + well-disguised part of the standard). So rewrite to use + `re_search' instead of `re_match'. + * psx-interf.c (test_regexec): add tests to, uh, match. + + * regex.h (RE_TIGHT_ALT): remove this; nobody uses it. + * regex.c: remove the code that was supposed to implement it. + + * other.c (test_others): ^ and $ never match newline characters; + RE_CONTEXT_INVALID_OPS doesn't affect anchors. + + * psx-interf.c (test_regerror): update for new error messages. + + * psx-extend.c: it's now ok to have an alternative be just a $, + so remove all the tests which supposed that was invalid. + +Wed Dec 25 09:00:05 1991 Karl Berry (karl at hayley) + + * regex.c (regex_compile): in handle_open, don't skip over ^ and + $ when checking for an empty group. POSIX has changed the + grammar. + * psx-extend.c (test_posix_extended): thus, move (^$) tests to + valid section. + + * regexinc.c (boolean): move from here to test.h and regex.c. + * test files: declare verbose, omit_register_tests, and + test_should_match as boolean. + + * psx-interf.c (test_posix_c_interface): remove the `c_'. + * main.c: likewise. 
+ + * psx-basic.c (test_posix_basic): ^ ($) is an anchor after + (before) an open (close) group. + + * regex.c (re_match_2): in endline, correct precedence of + posix_newline condition. + +Tue Dec 24 06:45:11 1991 Karl Berry (karl at hayley) + + * test.h: incorporate private-tst.h. + * test files: include test.h, not private-tst.h. + + * test.c (general_test): set posix_newline to zero if we are + doing POSIX tests (unfortunately, it's difficult to call + regcomp in this case, which is what we should really be doing). + + * regex.h (reg_syntax_t): make this an enumeration type which + defines the syntax bits; renames re_syntax_t. + + * regex.c (at_endline_op_p): don't preincrement p; then if it's + not an empty string op, we lose. + + * regex.h (reg_errcode_t): new enumeration type of the error + codes. + * regex.c (regex_compile): return that type. + + * regex.c (regex_compile): in [, initialize + just_had_a_char_class to false; somehow I had changed this to + true. + + * regex.h (RE_NO_CONSECUTIVE_REPEATS): remove this, since we + don't use it, and POSIX doesn't require this behavior anymore. + * regex.c (regex_compile): remove it from here. + + * regex.c (regex_compile): remove the no_op insertions for + verify_and_adjust_endlines, since that doesn't exist anymore. + + * regex.c (regex_compile) [DEBUG]: use printchar to print the + pattern, so unprintable bytes will print properly. + + * regex.c: move re_error_msg back. + * test.c (general_test): print the compile error if the pattern + was invalid. + +Mon Dec 23 08:54:53 1991 Karl Berry (karl at hayley) + + * regexinc.c: move re_error_msg here. + + * regex.c (re_error_msg): the ``message'' for success must be + NULL, to keep the interface to re_compile_pattern the same. + (regerror): if the msg is null, use "Success". + + * rename most test files for consistency. Change Makefile + correspondingly. + + * test.c (most routines): add casts to (unsigned char *) when we + call re_{match,search}{,_2}. 
+ +Sun Dec 22 09:26:06 1991 Karl Berry (karl at hayley) + + * regex.c (re_match_2): declare string args as unsigned char * + again; don't declare non-pointer args const; declare the + pattern buffer const. + (re_match): likewise. + (re_search_2, re_search): likewise, except don't declare the + pattern const, since we make a fastmap. + * regex.h [__STDC__]: change prototypes. + + * regex.c (regex_compile): return an error code, not a string. + (re_err_list): new table to map from error codes to string. + (re_compile_pattern): return an element of re_err_list. + (regcomp): don't test all the strings. + (regerror): just use the list. + (put_in_buffer): remove this. + + * regex.c (equivalent_failure_points): remove this. + + * regex.c (re_match_2): don't copy the string arguments into + non-const pointers. We never alter the data. + + * regex.c (re_match_2): move assignment to `is_a_jump_n' out of + the main loop. Just initialize it right before we do + something with it. + + * regex.[ch] (re_match_2): don't declare the int parameters const. + +Sat Dec 21 08:52:20 1991 Karl Berry (karl at hayley) + + * regex.h (re_syntax_t): new type; declare to be unsigned + (previously we used int, but since we do bit operations on + this, unsigned is better, according to H&S). + (obscure_syntax, re_pattern_buffer): use that type. + * regex.c (re_set_syntax, regex_compile): likewise. + + * regex.h (re_pattern_buffer): new field `posix_newline'. + * regex.c (re_comp, re_compile_pattern): set to zero. + (regcomp): set to REG_NEWLINE. + * regex.h (RE_HAT_LISTS_NOT_NEWLINE): remove this (we can just + check `posix_newline' instead.) + + * regex.c (op_list_type, op_list, add_op): remove these. + (verify_and_adjust_endlines): remove this. + (pattern_offset_list_type, *pattern_offset* routines): and these. + These things all implemented the nonleading/nontrailing position + code, which was very long, had a few remaining problems, and + is no longer needed. So... 
+ + * regexinc.c (STREQ): new macro to abbreviate strcmp(,)==0, for + brevity. Change various places in regex.c to use it. + + * regex{,inc}.c (enum regexpcode): change to a typedef + re_opcode_t, for brevity. + + * regex.h (re_syntax_table) [SYNTAX_TABLE]: remove this; it + should only be in regex.c, I think, since we don't define it + in this case. Maybe it should be conditional on !SYNTAX_TABLE? + + * regexinc.c (partial_compiled_pattern_printer): simplify and + distinguish the emacs/not-emacs (not)wordchar cases. + +Fri Dec 20 08:11:38 1991 Karl Berry (karl at hayley) + + * regexinc.c (regexpcode) [emacs]: only define the Emacs opcodes + if we are ifdef emacs. + + * regex.c (BUF_PUSH*): rename to PAT_PUSH*. + + * regex.c (regex_compile): in $ case, go back to essentially the + original code for deciding endline op vs. normal char. + (at_endline_op_p): new routine. + * regex.h (RE_ANCHORS_ONLY_AT_ENDS, RE_CONTEXT_INVALID_ANCHORS, + RE_REPEATED_ANCHORS_AWAY, RE_NO_ANCHOR_AT_NEWLINE): remove + these. POSIX has simplified the rules for anchors in draft + 11.2. + (RE_NEWLINE_ORDINARY): new syntax bit. + (RE_CONTEXT_INDEP_ANCHORS): change description to be compatible + with POSIX. + * regex.texinfo (Syntax Bits): remove the descriptions. + +Mon Dec 16 08:12:40 1991 Karl Berry (karl at hayley) + + * regex.c (re_match_2): in jump_past_next_alt, unconditionally + goto no_pop. The only register we were finding was one which + enclosed the whole alternative expression, not one around an + individual alternative. So we were never doing what we + thought we were doing, and this way makes (|a) against the + empty string fail. + + * regex.c (regex_compile): remove `highest_ever_regnum', and + don't restore regnum from the stack; just put it into a + temporary to put into the stop_memory. Otherwise, groups + aren't numbered consecutively. + + * regex.c (is_in_compile_stack): rename to + `group_in_compile_stack'; remove unnecessary test for the + stack being empty. 
+ + * regex.c (re_match_2): in on_failure_jump, skip no_op's before + checking for the start_memory, in case we were called from + succeed_n. + +Sun Dec 15 16:20:48 1991 Karl Berry (karl at hayley) + + * regex.c (regex_compile): in duplicate case, use + highest_ever_regnum instead of regnum, since the latter is + reverted at stop_memory. + + * regex.c (re_match_2): in on_failure_jump, if the * applied to + a group, save the information for that group and all inner + groups (by making it active), even though we're not inside it + yet. + +Sat Dec 14 09:50:59 1991 Karl Berry (karl at hayley) + + * regex.c (PUSH_FAILURE_ITEM, POP_FAILURE_ITEM): new macros. + Use them instead of copying the stack manipulating a zillion + times. + + * regex.c (PUSH_FAILURE_POINT, pop_failure_point) [DEBUG]: save + and restore a unique identification value for each failure point. + + * regexinc.c (partial_compiled_pattern_printer): don't print an + extra / after duplicate commands. + + * regex.c (regex_compile): in back-reference case, allow a back + reference to register `regnum'. Otherwise, even `\(\)\1' + fails, since regnum is 1 at the back-reference. + + * regex.c (re_match_2): in fail, don't examine the pattern if we + restored to pend. + + * test_private.h: rename to private_tst.h. Change includes. + + * regex.c (extend_bits_list): compute existing size for realloc + in bytes, not blocks. + + * regex.c (re_match_2): in jump_past_next_alt, the for loop was + missing its (empty) statement. Even so, some register tests + still fail, although in a different way than in the previous change. + +Fri Dec 13 15:55:08 1991 Karl Berry (karl at hayley) + + * regex.c (re_match_2): in jump_past_next_alt, unconditionally + goto no_pop, since we weren't properly detecting if the + alternative matched something anyway. No, we need to not jump + to keep the register values correct; just change to not look at + register zero and not test RE_NO_EMPTY_ALTS (which is a + compile-time thing). 
+ + * regex.c (SET_REGS_MATCHED): start the loop at 1, since we never + care about register zero until the very end. (I think.) + + * regex.c (PUSH_FAILURE_POINT, pop_failure_point): go back to + pushing and popping the active registers, instead of only doing + the registers before a group: (fooq|fo|o)*qbar against fooqbar + fails, since we restore back into the middle of group 1, yet it + isn't active, because the previous restore clobbered the active flag. + +Thu Dec 12 17:25:36 1991 Karl Berry (karl at hayley) + + * regex.c (PUSH_FAILURE_POINT): do not call + `equivalent_failure_points' after all; it causes the registers + to be ``wrong'' (according to POSIX), and an infinite loop on + `((a*)*)*' against `ab'. + + * regex.c (re_compile_fastmap): don't push `pend' on the failure + stack. + +Tue Dec 10 10:30:03 1991 Karl Berry (karl at hayley) + + * regex.c (PUSH_FAILURE_POINT): if pushing same failure point that + is on the top of the stack, fail. + (equivalent_failure_points): new routine. + + * regex.c (re_match_2): add debug statements for every opcode we + execute. + + * regex.c (regex_compile/handle_close): restore + `fixup_inner_group_count' and `regnum' from the stack. + +Mon Dec 9 13:51:15 1991 Karl Berry (karl at hayley) + + * regex.c (PUSH_FAILURE_POINT): declare `this_reg' as int, so + unsigned arithmetic doesn't happen when we don't want to save + the registers. + +Tue Dec 3 08:11:10 1991 Karl Berry (karl at hayley) + + * regex.c (extend_bits_list): divide size by bits/block. + + * regex.c (init_bits_list): remove redundant assignment to + `bits_list_ptr'. + + * regexinc.c (partial_compiled_pattern_printer): don't do *p++ + twice in the same expr. + + * regex.c (re_match_2): at on_failure_jump, use the correct + pattern positions for getting the stuff following the start_memory. + + * regex.c (struct register_info): remove the bits_list for the + inner groups; make that a separate variable. 
+ +Mon Dec 2 10:42:07 1991 Karl Berry (karl at hayley) + + * regex.c (PUSH_FAILURE_POINT): don't pass `failure_stack' as an + arg; change callers. + + * regex.c (PUSH_FAILURE_POINT): print items in order they are + pushed. + (pop_failure_point): likewise. + + * regex.c (main): prompt for the pattern and string. + + * regex.c (FREE_VARIABLES) [!REGEX_MALLOC]: declare as nothing; + remove #ifdefs from around calls. + + * regex.c (extract_number, extract_number_and_incr): declare static. + + * regex.c: remove the canned main program. + * main.c: new file. + * Makefile (COMMON): add main.o. + +Tue Sep 24 06:26:51 1991 Kathy Hargreaves (kathy at fosse) + + * regex.c (re_match_2): Made `pend' and `dend' not register variables. + Only set string2 to string1 if string1 isn't null. + Send address of p, d, regstart, regend, and reg_info to + pop_failure_point. + Put in more debug statements. + + * regex.c [debug]: Added global variable. + (DEBUG_*PRINT*): Only print if `debug' is true. + (DEBUG_DOUBLE_STRING_PRINTER): Changed DEBUG_STRING_PRINTER's + name to this. + Changed some comments. + (PUSH_FAILURE_POINT): Moved and added some debugging statements. + Was saving regstart on the stack twice instead of saving both + regstart and regend; remedied this. + [NUM_REGS_ITEMS]: Changed from 3 to 4, as now save lowest and + highest active registers instead of highest used one. + [NUM_NON_REG_ITEMS]: Changed name of NUM_OTHER_ITEMS to this. + (NUM_FAILURE_ITEMS): Use active registers instead of number 0 + through highest used one. + (re_match_2): Have pop_failure_point put things in the variables. + (pop_failure_point): Have it do what the fail case in re_match_2 + did with the failure stack, instead of throwing away the stuff + popped off. re_match_2 can ignore results when it doesn't + need them. + + +Thu Sep 5 13:23:28 1991 Kathy Hargreaves (kathy at fosse) + + * regex.c (banner): Changed copyright years to be separate. 
+ + * regex.c [CHAR_UNSIGNED]: Put __ at both ends of this name. + [DEBUG, debug_count, *debug_p, DEBUG_PRINT_1, DEBUG_PRINT_2, + DEBUG_COMPILED_PATTERN_PRINTER ,DEBUG_STRING_PRINTER]: + defined these for debugging. + (extract_number): Added this (debuggable) routine version of + the macro EXTRACT_NUMBER. Ditto for EXTRACT_NUMBER_AND_INCR. + (re_compile_pattern): Set return_default_num_regs if the + syntax bit RE_ALLOCATE_REGISTERS is set. + [REGEX_MALLOC]: Renamed USE_ALLOCA to this. + (BUF_POP): Got rid of this, as don't ever use it. + (regex_compile): Made the type of `pattern' not be register. + If DEBUG, print the pattern to compile. + (re_match_2): If had a `$' in the pattern before a `^' then + don't record the `^' as an anchor. + Put (enum regexpcode) before references to b, as suggested + [RE_NO_BK_BRACES]: Changed RE_NO_BK_CURLY_BRACES to this. + (remove_pattern_offset): Removed this unused routine. + (PUSH_FAILURE_POINT): Changed to only save active registers. + Put in debugging statements. + (re_compile_fastmap): Made `pattern' not a register variable. + Use routine for extracting numbers instead of macro. + (re_match_2): Made `p', `mcnt' and `mcnt2' not register variables. + Added `num_regs_pushed' for debugging. + Only malloc registers if the syntax bit RE_ALLOCATE_REGISTERS is set. + Put in debug statements. + Put the macro NOTE_INNER_GROUP's code inline, as it was the + only called in one place. + For debugging, extract numbers using routines instead of macros. + In case fail: only restore pushed active registers, and added + debugging statements. + (pop_failure_point): Test for underfull stack. + (group_can_match_nothing, common_op_can_match_nothing): For + debugging, extract numbers using routines instead of macros. + (regexec): Changed formal parameters to not be prototypes. + Don't initialize `regs' or `private_preg' in their declarations. 
+ +Tue Jul 23 18:38:36 1991 Kathy Hargreaves (kathy at hayley) + + * regex.h [RE_CONTEXT_INDEP_OPS]: Moved the anchor stuff out of + this bit. + [RE_UNMATCHED_RIGHT_PAREN_ORD]: Defined this bit. + [RE_CONTEXT_INVALID_ANCHORS]: Defined this bit. + [RE_CONTEXT_INDEP_ANCHORS]: Defined this bit. + Added RE_CONTEXT_INDEP_ANCHORS to all syntaxes which had + RE_CONTEXT_INDEP_OPS. + Took RE_ANCHORS_ONLY_AT_ENDS out of the POSIX basic syntax. + Added RE_UNMATCHED_RIGHT_PAREN_ORD to the POSIX extended + syntax. + Took RE_REPEATED_ANCHORS_AWAY out of the POSIX extended syntax. + Defined REG_NOERROR (which will probably have to go away again). + Changed the type `off_t' to `regoff_t'. + + * regex.c: Changed some comments. + (regex_compile): Added variable `had_an_endline' to keep track + of if hit a `$' since the beginning of the pattern or the last + alternative (if any). + Changed RE_CONTEXT_INVALID_OPS and RE_CONTEXT_INDEP_OPS to + RE_CONTEXT_INVALID_ANCHORS and RE_CONTEXT_INDEP_ANCHORS where + appropriate. + Put a `no_op' in the pattern if a repeat is only zero or one + times; in this case and if it is many times (whereupon a jump + backwards is pushed instead), keep track of the operator for + verify_and_adjust_endlines. + If RE_UNMATCHED_RIGHT_PAREN is set, make an unmatched + close-group operator match `)'. + Changed all error exits to exit (1). + (remove_pattern_offset): Added this routine, but don't use it. + (verify_and_adjust_endlines): At top of routine, if initialize + routines run out of memory, return true after setting + enough_memory false. + At end of endline, et al. case, don't set *p to no_op. + Repetition operators also set the level and active groups' + match statuses, unless RE_REPEATED_ANCHORS_AWAY is set. + (get_group_match_status): Put a return in front of call to get_bit. + (re_compile_fastmap): Changed is_a_succeed_n to a boolean. + If at end of pattern, then if the failure stack isn't empty, + go back to the failure point. 
+ In *jump* case, only pop the stack if what's on top of it is + where we've just jumped to. + (re_search_2): Return -2 instead of val if val is -2. + (group_can_match_nothing, alternative_can_match_nothing, + common_op_can-match_nothing): Now pass in reg_info for the + `duplicate' case. + (re_match_2): Don't skip over the next alternative also if + empty alternatives aren't allowed. + In fail case, if failed to a backwards jump that's part of a + repetition loop, pop the current failure point and use the + next one. + (pop_failure_point): Check that there's as many register items + on the failure stack as the stack says there are. + (common_op_can_match_nothing): Added variables `ret' and + `reg_no' so can set reg_info for the group encountered. + Also break without doing anything if hit a no_op or the other + kinds of `endline's. + If not done already, set reg_info in start_memory case. + Put in no_pop_jump for an optimized succeed_n of zero repetitions. + In succeed_n case, if the number isn't zero, then return false. + Added `duplicate' case. + +Sat Jul 13 11:27:38 1991 Kathy Hargreaves (kathy at hayley) + + * regex.h (REG_NOERROR): Added this error code definition. + + * regex.c: Took some redundant parens out of macros. + (enum regexpcode): Added jump_past_next_alt. + Wrapped some macros in `do..while (0)'. + Changed some comments. + (regex_compile): Use `fixup_alt_jump' instead of `fixup_jump'. + Use `maybe_pop_jump' instead of `maybe_pop_failure_jump'. + Use `jump_past_next_alt' instead of `no_pop_jump' when at the + end of an alternative. + (re_match_2): Used REGEX_ALLOCATE for the registers stuff. + In stop_memory case: Add more boolean tests to see if the + group is in a loop. + Added jump_past_next_alt case, which doesn't jump over the + next alternative if the last one didn't match anything. + Unfortunately, to make this work with, e.g., `(a+?*|b)*' + against `bb', I also had to pop the alternative's failure + point, which in turn broke backtracking! 
+ In fail case: Detect a dummy failure point by looking at + failure_stack.avail - 2, not stack[-2]. + (pop_failure_point): Only pop if the stack isn't empty; don't + give an error if it is. (Not sure yet this is correct.) + (group_can_match_nothing): Make it return a boolean instead of int. + Make it take an argument indicating the end of where it should look. + If find a group that can match nothing, set the pointer + argument to past the group in the pattern. + Took out cases which can share with alternative_can_match_nothing + and call common_op_can_match_nothing. + Took ++ out of switch, so could call common_op_can_match_nothing. + Wrote lots more for on_failure_jump case to handle alternatives. + Main loop now doesn't look for matching stop_memory, but + rather the argument END; return true if hit the matching + stop_memory; this way can call itself for inner groups. + (alternative_can_match_nothing): Added for alternatives. + (common_op_can_match_nothing): Added for previous two routines' + common operators. + (regerror): Returns a message saying there's no error if gets + sent REG_NOERROR. + +Wed Jul 3 10:43:15 1991 Kathy Hargreaves (kathy at hayley) + + * regex.c: Removed unnecessary enclosing parens from several macros. + Put `do..while (0)' around a few. + Corrected some comments. + (INIT_FAILURE_STACK_SIZE): Deleted in favor of using + INIT_FAILURE_ALLOC. + (INIT_FAILURE_STACK, DOUBLE_FAILURE_STACK, PUSH_PATTERN_OP, + PUSH_FAILURE_POINT): Made routines of the same name (but with all + lowercase letters) into these macros, so could use `alloca' + when USE_ALLOCA is defined. The reason is stated below for + bits lists. Deleted analogous routines. + (re_compile_fastmap): Added variable void *destination for + PUSH_PATTERN_OP. + (re_match_2): Added variable void *destination for REGEX_REALLOCATE. + Used the failure stack macros in place of the routines. 
+ Detected a dummy failure point by inspecting the failure stack's + (avail - 2)th element, not failure_stack.stack[-2]. This bug + arose when used the failure stack macros instead of the routines. + + * regex.c [USE_ALLOCA]: Put this conditional around previous + alloca stuff and defined these to work differently depending + on whether or not USE_ALLOCA is defined: + (REGEX_ALLOCATE): Uses either `alloca' or `malloc'. + (REGEX_REALLOCATE): Uses either `alloca' or `realloc'. + (INIT_BITS_LIST, EXTEND_BITS_LIST, SET_BIT_TO_VALUE): Defined + macro versions of routines with the same name (only with all + lowercase letters) so could use `alloc' in re_match_2. This + is to prevent core leaks when C-g is used in Emacs and to make + things faster and avoid storage fragmentation. These things + have to be macros because the results of `alloca' go away with + the routine by which it's called. + (BITS_BLOCK_SIZE, BITS_BLOCK, BITS_MASK): Moved to above the + above-mentioned macros instead of before the routines defined + below regex_compile. + (set_bit_to_value): Compacted some code. + (reg_info_type): Changed inner_groups field to be bits_list_type + so could be arbitrarily long and thus handle arbitrary nesting. + (NOTE_INNER_GROUP): Put `do...while (0)' around it so could + use as a statement. + Changed code to use bits lists. + Added variable void *destination for REGEX_REALLOCATE (whose call + is several levels in). + Changed variable name of `this_bit' to `this_reg'. + (FREE_VARIABLES): Only define and use if USE_ALLOCA is defined. + (re_match_2): Use REGEX_ALLOCATE instead of malloc. + Instead of setting INNER_GROUPS of reg_info to zero, have to + use INIT_BITS_LIST and return -2 (and free variables if + USE_ALLOCA isn't defined) if it fails. + +Fri Jun 28 13:45:07 1991 Karl Berry (karl at hayley) + + * regex.c (re_match_2): set value of `dend' when we restore `d'. + + * regex.c: remove declaration of alloca. 
+ + * regex.c (MISSING_ISGRAPH): rename to `ISGRAPH_MISSING'. + + * regex.h [_POSIX_SOURCE]: remove these conditionals; always + define POSIX stuff. + * regex.c (_POSIX_SOURCE): change conditionals to use `POSIX' + instead. + +Sat Jun 1 16:56:50 1991 Kathy Hargreaves (kathy at hayley) + + * regex.*: Changed RE_CONTEXTUAL_* to RE_CONTEXT_*, + RE_TIGHT_VBAR to RE_TIGHT_ALT, RE_NEWLINE_OR to + RE_NEWLINE_ALT, and RE_DOT_MATCHES_NEWLINE to RE_DOT_NEWLINE. + +Wed May 29 09:24:11 1991 Karl Berry (karl at hayley) + + * regex.texinfo (POSIX Pattern Buffers): cross-reference the + correct node name (Match-beginning-of-line, not ..._line). + (Syntax Bits): put @code around all syntax bits. + +Sat May 18 16:29:58 1991 Karl Berry (karl at hayley) + + * regex.c (global): add casts to keep broken compilers from + complaining about malloc and realloc calls. + + * regex.c (isgraph) [MISSING_ISGRAPH]: change test to this, + instead of `#ifndef isgraph', since broken compilers can't + have both a macro and a symbol by the same name. + + * regex.c (re_comp, re_exec) [_POSIX_SOURCE]: do not define. + (regcomp, regfree, regexec, regerror) [_POSIX_SOURCE && !emacs]: + only define in this case. + +Mon May 6 17:37:04 1991 Kathy Hargreaves (kathy at hayley) + + * regex.h (re_search, re_search_2): Changed BUFFER to not be const. + + * regex.c (re_compile_pattern): `^' is in a leading position if + it precedes a newline. + (various routines): Added or changed header comments. + (double_pattern_offsets_list): Changed name from + `extend_pattern_offsets_list'. + (adjust_pattern_offsets_list): Changed return value from + unsigned to void. + (verify_and_adjust_endlines): Now returns `true' and `false' + instead of 1 and 0. + `$' is in a leading position if it follows a newline. + (set_bit_to_value, get_bit_value): Exit with error if POSITION < 0 + so now calling routines don't have to. 
+ (init_failure_stack, inspect_failure_stack_top, + pop_failure_stack_top, push_pattern_op, double_failure_stack): + Now return value unsigned instead of boolean. + (re_search, re_search_2): Changed BUFP to not be const. + (re_search_2): Added variable const `private_bufp' to send to + re_match_2. + (push_failure_point): Made return value unsigned instead of boolean. + +Sat May 4 15:32:22 1991 Kathy Hargreaves (kathy at hayley) + + * regex.h (re_compile_fastmap): Added extern for this. + Changed some comments. + + * regex.c (re_compile_pattern): In case handle_bar: put invalid + pattern test before levels matching stuff. + Changed some commments. + Added optimizing test for detecting an empty alternative that + ends with a trailing '$' at the end of the pattern. + (re_compile_fastmap): Moved failure_stack stuff to before this + so could use it. Made its stack dynamic. + Made it return an int so that it could return -2 if its stack + couldn't be allocated. + Added to header comment (about the return values). + (init_failure_stack): Wrote so both re_match_2 and + re_compile_fastmap could use it similar stacks. + (double_failure_stack): Added for above reasons. + (push_pattern_op): Wrote for re_compile_fastmap. + (re_search_2): Now return -2 if re_compile_fastmap does. + (re_match_2): Made regstart and regend type failure_stack_element*. + (push_failure_point): Made pattern_place and string_place type + failure_stack_element*. + Call double_failure_stack now. + Return true instead of 1. + +Wed May 1 12:57:21 1991 Kathy Hargreaves (kathy at hayley) + + * regex.c (remove_intervening_anchors): Avoid erroneously making + ops into no_op's by making them no_op only when they're beglines. + (verify_and_adjust_endlines): Don't make '$' a normal character + if it's before a newline. + Look for the endline op in *p, not p[1]. + (failure_stack_element): Added this declaration. + (failure_stack_type): Added this declaration. 
+ (INIT_FAILURE_STACK_SIZE, FAILURE_STACK_EMPTY, + FAILURE_STACK_PTR_EMPTY, REMAINING_AVAIL_SLOTS): Added for + failure stack. + (FAILURE_ITEM_SIZE, PUSH_FAILURE_POINT): Deleted. + (FREE_VARIABLES): Now free failure_stack.stack instead of stackb. + (re_match_2): deleted variables `initial_stack', `stackb', + `stackp', and `stacke' and added `failure_stack' to replace them. + Replaced calls to PUSH_FAILURE_POINT with those to + push_failure_point. + (push_failure_point): Added for re_match_2. + (pop_failure_point): Rewrote to use a failure_stack_type of stack. + (can_match_nothing): Moved definition to below re_match_2. + (bcmp_translate): Moved definition to below re_match_2. + +Mon Apr 29 14:20:54 1991 Kathy Hargreaves (kathy at hayley) + + * regex.c (enum regexpcode): Added codes endline_before_newline + and repeated_endline_before_newline so could detect these + types of endlines in the intermediate stages of a compiled + pattern. + (INIT_FAILURE_ALLOC): Renamed NFAILURES to this and set it to 5. + (BUF_PUSH): Put `do {...} while 0' around this. + (BUF_PUSH_2): Defined this to cut down on expansion of EXTEND_BUFFER. + (regex_compile): Changed some comments. + Now push endline_before_newline if find a `$' before a newline + in the pattern. + If a `$' might turn into an ordinary character, set laststart + to point to it. + In '^' case, if syntax bit RE_TIGHT_VBAR is set, then for `^' + to be in a leading position, it must be first in the pattern. + Don't have to check in one of the else clauses that it's not set. + If RE_CONTEXTUAL_INDEP_OPS isn't set but RE_ANCHORS_ONLY_AT_ENDS + is, make '^' a normal character if it isn't first in the pattern. + Can only detect at the end if a '$' after an alternation op is a + trailing one, so can't immediately detect empty alternatives + if a '$' follows a vbar. + Added a picture of the ``success jumps'' in alternatives. + Have to set bufp->used before calling verify_and_adjust_endlines. 
+ Also do it before returning all error strings. + (remove_intervening_anchors): Now replaces the anchor with + repeated_endline_before_newline if it's an endline_before_newline. + (verify_and_adjust_endlines): Deleted SYNTAX parameter (could + use bufp's) and added GROUP_FORWARD_MATCH_STATUS so could + detect back references referring to empty groups. + Added variable `bend' to point past the end of the pattern buffer. + Added variable `previous_p' so wouldn't have to reinspect the + pattern buffer to see what op we just looked at. + Added endline_before_newline and repeated_endline_before_newline + cases. + When checking if in a trailing position, added case where '$' + has to be at the pattern's end if either of the syntax bits + RE_ANCHORS_ONLY_AT_ENDS or RE_TIGHT_VBAR are set. + Since `endline' can have the intermediate form `endline_in_repeat', + have to change it to `endline' if RE_REPEATED_ANCHORS_AWAY + isn't set. + Now disallow empty alternatives with trailing endlines in them + if RE_NO_EMPTY_ALTS is set. + Now don't make '$' an ordinary character if it precedes a newline. + Don't make it an ordinary character if it's before a newline. + Back references now affect the level matching something only if + they refer to nonempty groups. + (can_match_nothing): Now increment p1 in the switch, which + changes many of the cases, but makes the code more like what + it was derived from. + Adjust the return statement to reflect above. + (struct register_info): Made `can_match_nothing' field an int + instead of a bit so could have -1 in it if never set. + (MAX_FAILURE_ITEMS): Changed name from MAX_NUM_FAILURE_ITEMS. + (FAILURE_ITEM_SIZE): Defined how much space a failure items uses. + (PUSH_FAILURE_POINT): Changed variable `last_used_reg's name + to `highest_used_reg'. + Added variable `num_stack_items' and changed `len's name to + `stack_length'. + Test failure stack limit in terms of number of items in it, not + in terms of its length. 
rms' fix tested length against number + of items, which was a misunderstanding. + Use `realloc' instead of `alloca' to extend the failure stack. + Use shifts instead of multiplying by 2. + (FREE_VARIABLES): Free `stackb' instead of `initial_stack', as + might may have been reallocated. + (re_match_2): When mallocing `initial_stack', now multiply + the number of items wanted (what was there before) by + FAILURE_ITEM_SIZE. + (pop_failure_point): Need this procedure form of the macro of + the same name for debugging, so left it in and deleted the + macro. + (recomp): Don't free the pattern buffer's translate field. + +Mon Apr 15 09:47:47 1991 Kathy Hargreaves (kathy at hayley) + + * regex.h (RE_DUP_MAX): Moved to outside of #ifdef _POSIX_SOURCE. + * regex.c (#include <sys/types.h>): Removed #ifdef _POSIX_SOURCE + condition. + (malloc, realloc): Made return type void* #ifdef __STDC__. + (enum regexpcode): Added endline_in_repeat for the compiler's + use; this never ends up on the final compiled pattern. + (INIT_PATTERN_OFFSETS_LIST_SIZE): Initial size for + pattern_offsets_list_type. + (pattern_offset_type): Type for pattern offsets. + (pattern_offsets_list_type): Type for keeping a list of + pattern offsets. + (anchor_list_type): Changed to above type. + (PATTERN_OFFSETS_LIST_PTR_FULL): Tests if a pattern offsets + list is full. + (ANCHOR_LIST_PTR_FULL): Changed to above. + (BIT_BLOCK_SIZE): Changed to BITS_BLOCK_SIZE and moved to + above bits list routines below regex_compile. + (op_list_type): Defined to be pattern_offsets_list_type. + (compile_stack_type): Changed offsets to be + pattern_offset_type instead of unsigned. + (pointer): Changed the name of all structure fields from this + to `avail'. + (COMPILE_STACK_FULL): Changed so the stack is full if `avail' + is equal to `size' instead of `size' - 1. + (GET_BUFFER_SPACE): Changed `>=' to `>' in the while statement. 
+ (regex_compile): Added variable `enough_memory' so could check + that routine that verifies '$' positions could return an + allocation error. + (group_count): Deleted this variable, as `regnum' already does + this work. + (op_list): Added this variable to keep track of operations + needed for verifying '$' positions. + (anchor_list): Now initialize using routine + `init_pattern_offsets_list'. + Consolidated the three bits_list initializations. + In case '$': Instead of trying to go past constructs which can + follow '$', merely detect the special case where it has to be + at the pattern's end, fix up any fixup jumps if necessary, + record the anchor if necessary and add an `endline' (and + possibly two `no-op's) to the pattern; will call a routine at + the end to verify if it's in a valid position or not. + (init_pattern_offsets_list): Added to initialize pattern + offsets lists. + (extend_anchor_list): Renamed this extend_pattern_offsets_list + and renamed parameters and internal variables appropriately. + (add_pattern_offset): Added this routine which both + record_anchor_position and add_op call. + (adjust_pattern_offsets_list): Add this routine to adjust by + some increment all the pattern offsets a list of such after a + given position. + (record_anchor_position): Now send in offset instead of + calculating it and just call add_pattern_offset. + (adjust_anchor_list): Replaced by above routine. + (remove_intervening_anchors): If the anchor is an `endline' + then replace it with `endline_in_repeat' instead of `no_op'. + (add_op): Added this routine to call in regex_compile + wherever push something relevant to verifying '$' positions. + (verify_and_adjust_endlines): Added routine to (1) verify that + '$'s in a pattern buffer (represented by `endline') were in + valid positions and (2) whether or not they were anchors. + (BITS_BLOCK_SIZE): Renamed BIT_BLOCK_SIZE and moved to right + above bits list routines. 
+ (BITS_BLOCK): Defines which array element of a bits list the + bit corresponding to a given position is in. + (BITS_MASK): Has a 1 where the bit (in a bit list array element) + for a given position is. + +Mon Apr 1 12:09:06 1991 Kathy Hargreaves (kathy at hayley) + + * regex.c (BIT_BLOCK_SIZE): Defined this for using with + bits_list_type, abstracted from level_list_type so could use + for more things than just the level match status. + (regex_compile): Renamed `level_list' variable to + `level_match_status'. + Added variable `group_match_status' of type bits_list_type. + Kept track of whether or not for all groups any of them + matched other than the empty string, so detect if a back + reference in front of a '^' made it nonleading or not. + Do this by setting a match status bit for all active groups + whenever leave a group that matches other than the empty string. + Could detect which groups are active by going through the + stack each time, but or-ing a bits list of active groups with + a bits list of group match status is faster, so make a bits + list of active groups instead. + Have to check that '^' isn't in a leading position before + going to normal_char. + Whenever set level match status of the current level, also set + the match status of all active groups. + Increase the group count and make that group active whenever + open a group. + When close a group, only set the next level down if the + current level matches other than the empty string, and make + the current group inactive. + At a back reference, only set a level's match status if the + group to which the back reference refers matches other than + the empty string. + (init_bits_list): Added to initialize a bits list. + (get_level_value): Deleted this. (Made into + get_level_match_status.) + (extend_bits_list): Added to extend a bits list. (Made this + from deleted routine `extend_level_list'.) + (get_bit): Added to get a bit value from a bits list. 
 (Made + this from deleted routine `get_level_value'.) + (set_bit_to_value): Added to set a bit in a bits list. (Made + this from deleted routine `set_level_value'.) + (get_level_match_status): Added this to get the match status + of a given level. (Made from get_level_value.) + (set_this_level, set_next_lower_level): Made all routines + which set bits extend the bits list if necessary, thus they + now return an unsigned value to indicate whether or not the + reallocation failed. + (increase_level): No longer extends the level list. + (make_group_active): Added to mark as active a given group in + an active groups list. + (make_group_inactive): Added to mark as inactive a given group + in an active groups list. + (set_match_status_of_active_groups): Added to set the match + status of all currently active groups. + (get_group_match_status): Added to get a given group's match status. + (no_levels_match_anything): Removed the parameter LEVEL. + (PUSH_FAILURE_POINT): Added rms' bug fix and changed RE_NREGS + to num_internal_regs. + +Sun Mar 31 09:04:30 1991 Kathy Hargreaves (kathy at hayley) + + * regex.h (RE_ANCHORS_ONLY_AT_ENDS): Added syntax so could + constrain '^' and '$' to only be anchors if at the beginning + and end of the pattern. + (RE_SYNTAX_POSIX_BASIC): Added the above bit. + + * regex.c (enum regexcode): Changed `unused' to `no_op'. + (this_and_lower_levels_match_nothing): Deleted forward reference. + (regex_compile): case '^': if the syntax bit RE_ANCHORS_ONLY_AT_ENDS + is set, then '^' is only an anchor if at the beginning of the + pattern; only record anchor position if the syntax bit + RE_REPEATED_ANCHORS_AWAY is set; the '^' is a normal char if + the syntax bit RE_ANCHORS_ONLY_AT_ENDS is set and we're not at + the beginning of the pattern (and neither RE_CONTEXTUAL_INDEP_OPS + nor RE_CONTEXTUAL_INDEP_OPS syntax bits are set). + Only adjust the anchor list if the syntax bit + RE_REPEATED_ANCHORS_AWAY is set. 
+ + * regex.c (level_list_type): Use to detect when '^' is + in a leading position. + (regex_compile): Added level_list_type level_list variable in + which we keep track of whether or not a grouping level (in its + current or most recent incarnation) matches anything besides the + empty string. Set the bit for the i-th level when detect it + should match something other than the empty string and the bit + for the (i-1)-th level when leave the i-th group. Clear all + bits for the i-th and higher levels if none of 0--(i - 1)-th's + bits are set when encounter an alternation operator on that + level. If no levels are set when hit a '^', then it is in a + leading position. We keep track of which level we're at by + increasing a variable current_level whenever we encounter an + open-group operator and decreasing it whenever we encounter a + close-group operator. + Have to adjust the anchor list contents whenever insert + something ahead of them (such as on_failure_jump's) in the + pattern. + (adjust_anchor_list): Adjusts the offsets in an anchor list by + a given increment starting at a given start position. + (get_level_value): Returns the bit setting of a given level. + (set_level_value): Sets the bit of a given level to a given value. + (set_this_level): Sets (to 1) the bit of a given level. + (set_next_lower_level): Sets (to 1) the bit of (LEVEL - 1) for a + given LEVEL. + (clear_this_and_higher_levels): Clears the bits for a given + level and any higher levels. + (extend_level_list): Adds sizeof(unsigned) more bits to a level list. + (increase_level): Increases by 1 the value of a given level variable. + (decrease_level): Decreases by 1 the value of a given level variable. + (lower_levels_match_nothing): Checks if any levels lower than + the given one match anything. + (no_levels_match_anything): Checks if any levels match anything. + (re_match_2): At case wordbeg: before looking at d-1, check that + we're not at the string's beginning. 
+ At case wordend: Added some illuminating parentheses. + +Mon Mar 25 13:58:51 1991 Kathy Hargreaves (kathy at hayley) + + * regex.h (RE_NO_ANCHOR_AT_NEWLINE): Changed syntax bit name + from RE_ANCHOR_NOT_NEWLINE because an anchor never matches the + newline itself, just the empty string either before or after it. + (RE_REPEATED_ANCHORS_AWAY): Added this syntax bit for ignoring + anchors inside groups which are operated on by repetition + operators. + (RE_DOT_MATCHES_NEWLINE): Added this bit so the match-any-character + operator could match a newline when it's set. + (RE_SYNTAX_POSIX_BASIC): Set RE_DOT_MATCHES_NEWLINE in this. + (RE_SYNTAX_POSIX_EXTENDED): Set RE_DOT_MATCHES_NEWLINE and + RE_REPEATED_ANCHORS_AWAY in this. + (regerror): Changed prototypes to new POSIX spec. + + * regex.c (anchor_list_type): Added so could null out anchors inside + repeated groups. + (ANCHOR_LIST_PTR_FULL): Added for above type. + (compile_stack_element): Changed name from stack_element. + (compile_stack_type): Changed name from compile_stack. + (INIT_COMPILE_STACK_SIZE): Changed name from INIT_STACK_SIZE. + (COMPILE_STACK_EMPTY): Changed name from STACK_EMPTY. + (COMPILE_STACK_FULL): Changed name from STACK_FULL. + (regex_compile): Changed SYNTAX parameter to non-const. + Changed variable name `stack' to `compile_stack'. + If syntax bit RE_REPEATED_ANCHORS_AWAY is set, then naively put + anchors in a list when encounter them and then set them to + `unused' when detect they are within a group operated on by a + repetition operator. Need something more sophisticated than + this, as they should only get set to `unused' if they are in + positions where they would be anchors. Also need a better way to + detect contextually invalid anchors. + Changed some comments. + (is_in_compile_stack): Changed name from `is_in_stack'. + (extend_anchor_list): Added to do anchor stuff. + (record_anchor_position): Added to do anchor stuff. + (remove_intervening_anchors): Added to do anchor stuff. 
+ (re_match_2): Now match a newline with the match-any-character + operator if RE_DOT_MATCHES_NEWLINE is set. + Compacted some code. + (regcomp): Added new POSIX newline information to the header + comment. + If REG_NEWLINE cflag is set, then now unset RE_DOT_MATCHES_NEWLINE + in syntax. + (put_in_buffer): Added to do new POSIX regerror spec. Called + by regerror. + (regerror): Changed to take a pattern buffer, error buffer and + its size, and return type `size_t', the size of the full error + message, and the first ERRBUF_SIZE - 1 characters of the full + error message in the error buffer. + +Wed Feb 27 16:38:33 1991 Kathy Hargreaves (kathy at hayley) + + * regex.h (#include <sys/types.h>): Removed this as new POSIX + standard has the user include it. + (RE_SYNTAX_POSIX_BASIC and RE_SYNTAX_POSIX_EXTENDED): Removed + RE_HAT_LISTS_NOT_NEWLINE as new POSIX standard has the cflag + REG_NEWLINE now set this. Similarly, added syntax bit + RE_ANCHOR_NOT_NEWLINE as this is now unset by REG_NEWLINE. + (RE_SYNTAX_POSIX_BASIC): Removed syntax bit + RE_NO_CONSECUTIVE_REPEATS as POSIX now allows them. + + * regex.c (#include <sys/types.h>): Added this as new POSIX + standard has the user include it instead of us putting it in + regex.h. + (extern char *re_syntax_table): Made into an extern so the + user could allocate it. + (DO_RANGE): If don't find a range end, now goto invalid_range_end + instead of unmatched_left_bracket. + (regex_compile): Made variable SYNTAX non-const.???? + Reformatted some code. + (re_compile_fastmap): Moved is_a_succeed_n's declaration to + inner braces. + Compacted some code. + (SET_NEWLINE_FLAG): Removed and put inline. + (regcomp): Made variable `syntax' non-const so can unset + RE_ANCHOR_NOT_NEWLINE syntax bit if cflag REG_NEWLINE is set. + If cflag REG_NEWLINE is set, set the RE_HAT_LISTS_NOT_NEWLINE + syntax bit and unset RE_ANCHOR_NOT_NEWLINE one of `syntax'. 
+ +Wed Feb 20 16:33:38 1991 Kathy Hargreaves (kathy at hayley) + + * regex.h (RE_NO_CONSECUTIVE_REPEATS): Changed name from + RE_NO_CONSEC_REPEATS. + (REG_ENESTING): Deleted this POSIX return value, as the stack + is now unbounded. + (struct re_pattern_buffer): Changed some comments. + (re_compile_pattern): Changed a comment. + Deleted check on stack upper bound and corresponding error. + Now when there's no interval contents and it's the end of the + pattern, go to unmatched_left_curly_brace instead of end_of_pattern. + Removed nesting_too_deep error, as the stack is now unbounded. + (regcomp): Removed REG_ENESTING case, as the stack is now unbounded. + (regerror): Removed REG_ENESTING case, as the stack is now unbounded. + + * regex.c (MAX_STACK_SIZE): Deleted because don't need upper + bound on array indexed with an unsigned number. + +Sun Feb 17 15:50:24 1991 Kathy Hargreaves (kathy at hayley) + + * regex.h: Changed and added some comments. + + * regex.c (init_syntax_once): Made `_' a word character. + (re_compile_pattern): Added a comment. + (re_match_2): Redid header comment. + (regexec): With header comment about PMATCH, corrected and + removed details found regex.h, adding a reference. + +Fri Feb 15 09:21:31 1991 Kathy Hargreaves (kathy at hayley) + + * regex.c (DO_RANGE): Removed argument parentheses. + Now get untranslated range start and end characters and set + list bits for the translated (if at all) versions of them and + all characters between them. + (re_match_2): Now use regs->num_regs instead of num_regs_wanted + wherever possible. + (regcomp): Now build case-fold translate table using isupper + and tolower facilities so will work on foreign language characters. + +Sat Feb 9 16:40:03 1991 Kathy Hargreaves (kathy at hayley) + + * regex.h (RE_HAT_LISTS_NOT_NEWLINE): Changed syntax bit name + from RE_LISTS_NOT_NEWLINE as it only affects nonmatching lists. 
+ Changed all references to the match-beginning-of-string + operator to match-beginning-of-line operator, as this is what + it does. + (RE_NO_CONSEC_REPEATS): Added this syntax bit. + (RE_SYNTAX_POSIX_BASIC): Added above bit to this. + (REG_PREMATURE_END): Changed name to REG_EEND. + (REG_EXCESS_NESTING): Changed name to REG_ENESTING. + (REG_TOO_BIG): Changed name to REG_ESIZE. + (REG_INVALID_PREV_RE): Deleted this return POSIX value. + Added and changed some comments. + + * regex.c (re_compile_pattern): Now sets the pattern buffer's + `return_default_num_regs' field. + (typedef struct stack_element, stack_type, INIT_STACK_SIZE, + MAX_STACK_SIZE, STACK_EMPTY, STACK_FULL): Added for regex_compile. + (INIT_BUF_SIZE): Changed value from 28 to 32. + (BUF_PUSH): Changed name from BUFPUSH. + (MAX_BUF_SIZE): Added so could use in many places. + (IS_CHAR_CLASS_STRING): Replaced is_char_class with this. + (regex_compile): Added a stack which could grow dynamically + and which has struct elements. + Go back to initializing `zero_times_ok' and `many_time_ok' to + 0 and |=ing them inside the loop. + Now disallow consecutive repetition operators if the syntax + bit RE_NO_CONSEC_REPEATS is set. + Now detect trailing backslash when the compiler is expecting a + `?' or a `+'. + Changed calls to GET_BUFFER_SPACE which asked for 6 to ask for + 3, as that's all they needed. + Now check for trailing backslash inside lists. + Now disallow an empty alternative right before an end-of-line + operator. + Now get buffer space before leaving space for a fixup jump. + Now check if at pattern end when at open-interval operator. + Added some comments. + Now check if non-interval repetition operators follow an + interval one if the syntax bit RE_NO_CONSEC_REPEATS is set. + Now only check if what precedes an interval repetition + operator isn't a regular expression which matches one + character if the syntax bit RE_NO_CONSEC_REPEATS is set. 
+ Now return "Unmatched [ or [^" instead of "Unmatched [". + (is_in_stack): Added to check if a given register number is in + the stack. + (re_match_2): If initial variable allocations fail, return -2, + instead of -1. + Now set reg's `num_regs' field when allocating regs. + Now before allocating them, free regs->start and end if they + aren't NULL and return -2 if either allocation fails. + Now use regs->num_regs instead of num_regs_wanted to control + regs loops. + Now increment past the newline when matching it with an + end-of-line operator. + (regcomp): Added to the header comment. + Now return REG_ESUBREG if regex_compile returns "Unmatched [ + or [^" instead of doing so if it returns "Unmatched [". + Now return REG_BADRPT if in addition to returning "Missing + preceding regular expression", regex_compile returns "Invalid + preceding regular expression". + Now return new return value names (see regex.h changes). + (regexec): Added to header comment. + Initialize regs structure. + Now match whole string. + Now always free regs.start and regs.end instead of just when + the string matched. + (regerror): Now return "Regex error: Unmatched [ or [^.\n" + instead of "Regex error: Unmatched [.\n". + Now return "Regex error: Preceding regular expression either + missing or not simple.\n" instead of "Regex error: Missing + preceding regular expression.\n". + Removed REG_INVALID_PREV_RE case (it got subsumed into the + REG_BADRPT case). + +Thu Jan 17 09:52:35 1991 Kathy Hargreaves (kathy at hayley) + + * regex.h: Changed a comment. + + * regex.c: Changed and added large header comments. + (re_compile_pattern): Now if detect that `laststart' for an + interval points to a byte code for a regular expression which + matches more than one character, make it an internal error. + (regerror): Return error message, don't print it. + +Tue Jan 15 15:32:49 1991 Kathy Hargreaves (kathy at hayley) + + * regex.h (regcomp return codes): Added GNU ones. + Updated some comments. 
+ + * regex.c (DO_RANGE): Changed `obscure_syntax' to `syntax'. + (regex_compile): Added `following_left_brace' to keep track of + where pseudo interval following a valid interval starts. + Changed some instances that returned "Invalid regular + expression" to instead return error strings coinciding with + POSIX error codes. + Changed some comments. + Now consider only things between `[:' and `:]' to be possible + character class names. + Now a character class expression can't end a pattern; at + least a `]' must close the list. + Now if the syntax bit RE_NO_BK_CURLY_BRACES is set, then a + valid interval must be followed by yet another to get an error + for preceding an interval (in this case, the second one) with + a regular expression that matches more than one character. + Now if what follows a valid interval begins with a open + interval operator but doesn't begin a valid interval, then set + following_left_bracket to it, put it in C and go to + normal_char label. + Added some comments. + Return "Invalid character class name" instead of "Invalid + character class". + (regerror): Return messages for all POSIX error codes except + REG_ECOLLATE and REG_NEWLINE, along with all GNU error codes. + Added `break's after all cases. + (main): Call re_set_syntax instead of setting `obscure_syntax' + directly. + +Sat Jan 12 13:37:59 1991 Kathy Hargreaves (kathy at hayley) + + * regex.h (Copyright): Updated date. + (#include <sys/types.h>): Include unconditionally. + (RE_CANNOT_MATCH_NEWLINE): Deleted this syntax bit. + (RE_SYNTAX_POSIX_BASIC, RE_SYNTAX_POSIX_EXTENDED): Removed + setting the RE_ANCHOR_NOT_NEWLINE syntax bit from these. + Changed and added some comments. + (struct re_pattern_buffer): Changed some flags from chars to bits. + Added field `syntax'; holds which syntax pattern was compiled with. + Added bit flag `return_default_num_regs'. + (externs for GNU and Berkeley UNIX routines): Added `const's to + parameter types to be compatible with POSIX. 
+ (#define const): Added to support old C compilers. + + * regex.c (Copyright): Updated date. + (enum regexpcode): Deleted `newline'. + (regex_compile): Renamed re_compile_pattern to this, added a + syntax parameter so it can set the pattern buffer's `syntax' + field. + Made `pattern', and `size' `const's so could pass to POSIX + interface routines; also made `const' whatever interval + variables had to be to make this work. + Changed references to `obscure_syntax' to new parameter `syntax'. + Deleted putting `newline' in buffer when see `\n'. + Consider invalid character classes which have nothing wrong + except the character class name; if so, return character-class error. + (is_char_class): Added routine for regex_compile. + (re_compile_pattern): added a new one which calls + regex_compile with `obscure_syntax' as the actual parameter + for the formal `syntax'. + Gave this the old routine's header comments. + Made `pattern', and `size' `const's so could use POSIX interface + routine parameters. + (re_search, re_search_2, re_match, re_match_2): Changed + `pbufp' to `bufp'. + (re_search_2, re_match_2): Changed `mstop' to `stop'. + (re_search, re_search_2): Made all parameters except `regs' + `const's so could use POSIX interface routines parameters. + (re_search_2): Added private copies of `const' parameters so + could change their values. + (re_match_2): Made all parameters except `regs' `const's so + could use POSIX interface routines parameters. + Changed `size1' and `size2' parameters to `size1_arg' and + `size2_arg' and so could change; added local `size1' and + `size2' and set to these. + Added some comments. + Deleted `newline' case. + `begline' can also possibly match if `d' contains a newline; + if it does, we have to increment d to point past the newline. + Replaced references to `obscure_syntax' with `bufp->syntax'. + (re_comp, re_exec): Made parameter `s' a `const' so could use POSIX + interface routines parameters. 
+ Now call regex_compile, passing `obscure_syntax' via the + `syntax' parameter. + (re_exec): Made local `len' a `const' so could pass to re_search. + (regcomp): Added header comment. + Added local `syntax' to set and pass to regex_compile rather + than setting global `obscure_syntax' and passing it. + Call regex_compile with its `syntax' parameter rather than + re_compile_pattern. + Return REG_ECTYPE if character-class error. + (regexec): Don't initialize `regs' to anything. + Made `private_preg' a nonpointer so could set to what the + constant `preg' points. + Initialize `private_preg's `return_default_num_regs' field to + zero because want to return `nmatch' registers, not however + many there are subexpressions in the pattern. + Also test if `nmatch' > 0 to see if should pass re_match `regs'. + +Tue Jan 8 15:57:17 1991 Kathy Hargreaves (kathy at hayley) + + * regex.h (struct re_pattern_buffer): Reworded comment. + + * regex.c (EXTEND_BUFFER): Also reset beg_interval. + (re_search_2): Return val if val = -2. + (NUM_REG_ITEMS): Listed items in comment. + (NUM_OTHER_ITEMS): Defined this for using in > 1 definition. + (MAX_NUM_FAILURE_ITEMS): Replaced `+ 2' with NUM_OTHER_ITEMS. + (NUM_FAILURE_ITEMS): As with definition above and added to + comment. + (PUSH_FAILURE_POINT): Replaced `* 2's with `<< 1's. + (re_match_2): Test with equality with 1 to see pbufp->bol and + pbufp->eol are set. + +Fri Jan 4 15:07:22 1991 Kathy Hargreaves (kathy at hayley) + + * regex.h (struct re_pattern_buffer): Reordered some fields. + Updated some comments. + Added not_bol and not_eol fields. + (extern regcomp, regexec, regerror): Added return types. + (extern regfree): Added `extern'. + + * regex.c (min): Deleted unused macro. + (re_match_2): Compacted some code. + Removed call to macro `min' from `for' loop. + Fixed so unused registers get filled with -1's. + Fail if the pattern buffer's `not_bol' field is set and + encounter a `begline'. 
+ Fail if the pattern buffer's `not_eol' field is set and + encounter a `endline'. + Deleted redundant check for empty stack in fail case. + Don't free pattern buffer's components in re_comp. + (regexec): Initialize variable regs. + Added `private_preg' pattern buffer so could set `not_bol' and + `not_eol' fields and hand to re_match. + Deleted naive attempt to detect anchors. + Set private pattern buffer's `not_bol' and `not_eol' fields + according to eflags value. + `nmatch' must also be > 0 for us to bother allocating + registers to send to re_match and filling pmatch + with their results after the call to re_match. + Send private pattern buffer instead of argument to re_match. + If use the registers, always free them and then set them to NULL. + (regerror): Added this Posix routine. + (regfree): Added this Posix routine. + +Tue Jan 1 15:02:45 1991 Kathy Hargreaves (kathy at hayley) + + * regex.h (RE_NREGS): Deleted this definition, as now the user + can choose how many registers to have. + (REG_NOTBOL, REG_NOTEOL): Defined these Posix eflag bits. + (REG_NOMATCH, REG_BADPAT, REG_ECOLLATE, REG_ECTYPE, + REG_EESCAPE, REG_ESUBREG, REG_EBRACK, REG_EPAREN, REG_EBRACE, + REG_BADBR, REG_ERANGE, REG_ESPACE, REG_BADRPT, REG_ENEWLINE): + Defined these return values for Posix's regcomp and regexec. + Updated some comments. + (struct re_pattern_buffer): Now typedef this as regex_t + instead of the other way around. + (struct re_registers): Added num_regs field. Made start and + end fields pointers to char instead of fixed size arrays. + (regmatch_t): Added this Posix register type. + (regcomp, regexec, regerror, regfree): Added externs for these + Posix routines. + + * regex.c (enum boolean): Typedefed this. + (re_pattern_buffer): Reformatted some comments. + (re_compile_pattern): Updated some comments. 
+ Always push start_memory and its attendant number whenever + encounter a group, not just when its number is less than the + previous maximum number of registers; same for stop_memory. + Get 4 bytes of buffer space instead of 2 when pushing a + set_number_at. + (can_match_nothing): Added this to elaborate on and replace + code in re_match_2. + (reg_info_type): Made can_match_nothing field a bit instead of int. + (MIN): Added for re_match_2. + (re_match_2 macros): Changed all `for' loops which used + RE_NREGS to now use num_internal_regs as upper bounds. + (MAX_NUM_FAILURE_ITEMS): Use num_internal_regs instead of RE_NREGS. + (POP_FAILURE_POINT): Added check for empty stack. + (FREE_VARIABLES): Added this to free (and set to NULL) + variables allocated in re_match_2. + (re_match_2): Rearranged parameters to be in order. + Added variables num_regs_wanted (how many registers the user wants) + and num_internal_regs (how many groups there are). + Allocated initial_stack, regstart, regend, old_regstart, + old_regend, reginfo, best_regstart, and best_regend---all + which used to be fixed size arrays. Free them all and return + -1 if any fail. + Free above variables if starting position pos isn't valid. + Changed all `for' loops which used RE_NREGS to now use + num_internal_regs as upper bounds---except for the loops which + fill regs; then use num_regs_wanted. + Allocate regs if the user has passed it and wants more than 0 + registers filled. + Set regs->start[i] and regs->end[i] to -1 if either + regstart[i] or regend[i] equals -1, not just the first. + Free allocated variables before returning. + Updated some comments. + (regcomp): Return REG_ESPACE, REG_BADPAT, REG_EPAREN when + appropriate. + Free translate array. + (regexec): Added this Posix interface routine. + +Mon Dec 24 14:21:13 1990 Kathy Hargreaves (kathy at hayley) + + * regex.h: If _POSIX_SOURCE is defined then #include <sys/types.h>. + Added syntax bit RE_CANNOT_MATCH_NEWLINE. 
+ Defined Posix cflags: REG_EXTENDED, REG_NEWLINE, REG_ICASE, and + REG_NOSUB. + Added fields re_nsub and no_sub to struct re_pattern_buffer. + Typedefed regex_t to be `struct re_pattern_buffer'. + + * regex.c (CHAR_SET_SIZE): Defined this to be 256 and replaced + incidences of this value with this constant. + (re_compile_pattern): Added switch case for `\n' and put + `newline' into the pattern buffer when encounter this. + Increment the pattern_buffer's `re_nsub' field whenever open a + group. + (re_match_2): Match a newline with `newline'---provided the + syntax bit RE_CANNOT_MATCH_NEWLINE isn't set. + (regcomp): Added this Posix interface routine. + (enum test_type): Added interface_test tag. + (main): Added Posix interface test. + +Tue Dec 18 12:58:12 1990 Kathy Hargreaves (kathy at hayley) + + * regex.h (struct re_pattern_buffer): reformatted so would fit + in texinfo documentation. + +Thu Nov 29 15:49:16 1990 Kathy Hargreaves (kathy at hayley) + + * regex.h (RE_NO_EMPTY_ALTS): Added this bit. + (RE_SYNTAX_POSIX_EXTENDED): Added above bit. + + * regex.c (re_compile_pattern): Disallow empty alternatives only + when RE_NO_EMPTY_ALTS is set, not when RE_CONTEXTUAL_INVALID_OPS is. + Changed RE_NO_BK_CURLY_BRACES to RE_NO_BK_PARENS when testing + for empty groups at label handle_open. + At label handle_bar: disallow empty alternatives if RE_NO_EMPTY_ALTS + is set. + Rewrote some comments. + + (re_compile_fastmap): cleaned up code. + + (re_search_2): Rewrote comment. + + (struct register_info): Added field `inner_groups'; it records + which groups are inside of the current one. + Added field can_match_nothing; it's set if the current group + can match nothing. + Added field ever_match_something; it's set if current group + ever matched something. + + (INNER_GROUPS): Added macro to access inner_groups field of + struct register_info. + + (CAN_MATCH_NOTHING): Added macro to access can_match_nothing + field of struct register_info. 
+ + (EVER_MATCHED_SOMETHING): Added macro to access + ever_matched_something field of struct register_info. + + (NOTE_INNER_GROUP): Defined macro to record that a given group + is inside of all currently active groups. + + (re_match_2): Added variables *p1 and mcnt2 (multipurpose). + Added old_regstart and old_regend arrays to hold previous + register values if they need be restored. + Initialize added fields and variables. + case start_memory: Find out if the group can match nothing. + Save previous register values in old_regstart and old_regend. + Record that current group is inside of all currently active + groups. + If the group is inside a loop and it ever matched anything, + restore its registers to values before the last failed match. + Restore the registers for the inner groups, too. + case duplicate: Can back reference to a group that never + matched if it can match nothing. + +Thu Nov 29 11:12:54 1990 Karl Berry (karl at hayley) + + * regex.c (bcopy, ...): define these if either _POSIX_SOURCE or + STDC_HEADERS is defined; same for including <stdlib.h>. + +Sat Oct 6 16:04:55 1990 Kathy Hargreaves (kathy at hayley) + + * regex.h (struct re_pattern_buffer): Changed field comments. + + * regex.c (re_compile_pattern): Allow a `$' to precede an + alternation operator (`|' or `\|'). + Disallow `^' and/or `$' in empty groups if the syntax bit + RE_NO_EMPTY_GROUPS is set. + Wait until have parsed a valid `\{...\}' interval expression + before testing RE_CONTEXTUAL_INVALID_OPS to see if it's + invalidated by that. + Don't use RE_NO_BK_CURLY_BRACES to test whether or not a validly + parsed interval expression is invalid if it has no preceding re; + rather, use RE_CONTEXTUAL_INVALID_OPS. 
+ If an interval parses, but there is no preceding regular + expression, yet the syntax bit RE_CONTEXTUAL_INDEP_OPS is set, + then that interval can match the empty regular expression; if + the bit isn't set, then the characters in the interval + expression are parsed as themselves (sans the backslashes). + In unfetch_interval case: Moved PATFETCH to above the test for + RE_NO_BK_CURLY_BRACES being set, which would force a goto + normal_backslash; the code at both normal_backsl and normal_char + expect a character in `c.' + +Sun Sep 30 11:13:48 1990 Kathy Hargreaves (kathy at hayley) + + * regex.h: Changed some comments to use the terms used in the + documentation. + (RE_CONTEXTUAL_INDEP_OPS): Changed name from `RE_CONTEXT_INDEP_OPS'. + (RE_LISTS_NOT_NEWLINE): Changed name from `RE_HAT_NOT_NEWLINE.' + (RE_ANCHOR_NOT_NEWLINE): Added this syntax bit. + (RE_NO_EMPTY_GROUPS): Added this syntax bit. + (RE_NO_HYPHEN_RANGE_END): Deleted this syntax bit. + (RE_SYNTAX_...): Reformatted. + (RE_SYNTAX_POSIX_BASIC, RE_SYNTAX_EXTENDED): Added syntax bits + RE_ANCHOR_NOT_NEWLINE and RE_NO_EMPTY_GROUPS, and deleted + RE_NO_HYPHEN_RANGE_END. + (RE_SYNTAX_POSIX_EXTENDED): Added syntax bit RE_DOT_NOT_NULL. + + * regex.c (bcopy, bcmp, bzero): Define if _POSIX_SOURCE is defined. + (_POSIX_SOURCE): ifdef this, #include <stdlib.h> + (#ifdef emacs): Changed comment of the #endif for the its #else + clause to be `not emacs', not `emacs.' + (no_pop_jump): Changed name from `jump'. + (pop_failure_jump): Changed name from `finalize_jump.' + (maybe_pop_failure_jump): Changed name from `maybe_finalize_jump'. + (no_pop_jump_n): Changed name from `jump_n.' + (EXTEND_BUFFER): Use shift instead of multiplication to double + buf->allocated. + (DO_RANGE, recompile_pattern): Added macro to set the list bits + for a range. + (re_compile_pattern): Fixed grammar problems in some comments. + Checked that RE_NO_BK_VBAR is set to make `$' valid before a `|' + and not set to make it valid before a `\|'. 
+ Checked that RE_NO_BK_PARENS is set to make `$' valid before a ')' + and not set to make it valid before a `\)'. + Disallow ranges starting with `-', unless the range is the + first item in a list, rather than disallowing ranges which end + with `-'. + Disallow empty groups if the syntax bit RE_NO_EMPTY_GROUPS is set. + Disallow nothing preceding `{' and `\{' if they represent the + open-interval operator and RE_CONTEXTUAL_INVALID_OPS is set. + (register_info_type): typedef-ed this using `struct register_info.' + (SET_REGS_MATCHED): Compacted the code. + (re_match_2): Made it fail if back reference a group which we've + never matched. + Made `^' not match a newline if the syntax bit + RE_ANCHOR_NOT_NEWLINE is set. + (really_fail): Added this label so could force a final fail that + would not try to use the failure stack to recover. + +Sat Aug 25 14:23:01 1990 Kathy Hargreaves (kathy at hayley) + + * regex.h (RE_CONTEXTUAL_OPS): Changed name from RE_CONTEXT_OPS. + (global): Rewrote comments and rebroke some syntax #define lines. + + * regex.c (isgraph): Added definition for sequents. + (global): Now refer to character set lists as ``lists.'' + Rewrote comments containing ``\('' or ``\)'' to now refer to + ``groups.'' + (RE_CONTEXTUAL_OPS): Changed name from RE_CONTEXT_OPS. + + (re_compile_pattern): Expanded header comment. + +Sun Jul 15 14:50:25 1990 Kathy Hargreaves (kathy at hayley) + + * regex.h (RE_CONTEX_INDEP_OPS): the comment's sense got turned + around when we changed how it read; changed it to be correct. + +Sat Jul 14 16:38:06 1990 Kathy Hargreaves (kathy at hayley) + + * regex.h (RE_NO_EMPTY_BK_REF): changed name to + RE_NO_MISSING_BK_REF, as this describes it better. + + * regex.c (re_compile_pattern): changed RE_NO_EMPTY_BK_REF + to RE_NO_MISSING_BK_REF, as above. 
+ +Thu Jul 12 11:45:05 1990 Kathy Hargreaves (kathy at hayley) + + * regex.h (RE_NO_EMPTY_BRACKETS): removed this syntax bit, as + bracket expressions should *never* be empty regardless of the + syntax. Removes this bit from RE_SYNTAX_POSIX_BASIC and + RE_SYNTAX_POSIX_EXTENDED. + + * regex.c (SET_LIST_BIT): in the comment, now refer to character + sets as (non)matching sets, as bracket expressions can now match + other things in addition to characters. + (re_compile_pattern): refer to groups as such instead of `\(...\)' + or somesuch, because groups can now be enclosed in either plain + parens or backslashed ones, depending on the syntax. + In the '[' case, added a boolean just_had_a_char_class to detect + whether or not a character class begins a range (which is invalid). + Restore way of breaking out of a bracket expression to original way. + Add way to detect a range if the last thing in a bracket + expression was a character class. + Took out check for c != ']' at the end of a character class in + the else clause, as it had already been checked in the if part + that also checked the validity of the string. + Set or clear just_had_a_char_class as appropriate. + Added some comments. Changed references to character sets to + ``(non)matching lists.'' + +Sun Jul 1 12:11:29 1990 Karl Berry (karl at hayley) + + * regex.h (BYTEWIDTH): moved back to regex.c. + + * regex.h (re_compile_fastmap): removed declaration; this + shouldn't be advertised. + +Mon May 28 15:27:53 1990 Kathy Hargreaves (kathy at hayley) + + * regex.c (ifndef Sword): Made comments more specific. + (global): include <stdio.h> so can write fatal messages on + standard error. Replaced calls to assert with fprintfs to + stderr and exit (1)'s. + (PREFETCH): Reformatted to make more readable. + (AT_STRINGS_BEG): Defined to test if we're at the beginning of + the virtual concatenation of string1 and string2. 
+ (AT_STRINGS_END): Defined to test if at the end of the virtual + concatenation of string1 and string2. + (AT_WORD_BOUNDARY): Defined to test if are at a word boundary. + (IS_A_LETTER(d)): Defined to test if the contents of the pointer D + is a letter. + (re_match_2): Rewrote the wordbound, notwordbound, wordbeg, wordend, + begbuf, and endbuf cases in terms of the above four new macros. + Called SET_REGS_MATCHED in the matchsyntax, matchnotsyntax, + wordchar, and notwordchar cases. + +Mon May 14 14:49:13 1990 Kathy Hargreaves (kathy at hayley) + + * regex.c (re_search_2): Fixed RANGE to not ever take STARTPOS + outside of virtual concatenation of STRING1 and STRING2. + Updated header comment as to this. + (re_match_2): Clarified comment about MSTOP in header. + +Sat May 12 15:39:00 1990 Kathy Hargreaves (kathy at hayley) + + * regex.c (re_search_2): Checked for out-of-range STARTPOS. + Added comments. + When searching backwards, not only get the character with which + to compare to the fastmap from string2 if the starting position + >= size1, but also if size1 is zero; this is so won't get a + segmentation fault if string1 is null. + Reformatted code at label advance. + +Thu Apr 12 20:26:21 1990 Kathy Hargreaves (kathy at hayley) + + * regex.h: Added #pragma once and #ifdef...endif __REGEXP_LIBRARY. + (RE_EXACTN_VALUE): Added for search.c to use. + Reworded some comments. + + regex.c: Punctuated some comments correctly. + (NULL): Removed this. + (RE_EXACTN_VALUE): Added for search.c to use. + (<ctype.h>): Moved this include to top of file. + (<assert.h>): Added this include. + (struct regexpcode): Assigned 0 to unused and 1 to exactn + because of RE_EXACTN_VALUE. + Added comment. + (various macros): Lined up backslashes near end of line. + (insert_jump): Cleaned up the header comment. + (re_search): Corrected the header comment. + (re_search_2): Cleaned up and completed the header comment. + (re_max_failures): Updated comment. 
+ (struct register_info): Constructed as bits so as to save space + on the stack when pushing register information. + (IS_ACTIVE): Macro for struct register_info. + (MATCHED_SOMETHING): Macro for struct register_info. + (NUM_REG_ITEMS): How many register information items for each + register we have to push on the stack at each failure. + (MAX_NUM_FAILURE_ITEMS): If push all the registers on failure, + this is how many items we push on the stack. + (PUSH_FAILURE_POINT): Now pushes whether or not the register is + currently active, and whether or not it matched something. + Checks that there's enough space allocated to accommodate all the + items we currently want to push. (Before, a test for an empty + stack sufficed because we always pushed and popped the same + number of items). + Replaced ``2'' with MAX_NUM_FAILURE_POINTS when ``2'' refers + to how many things get pushed on the stack each time. + When copy the stack into the newly allocated storage, now only copy + the area in use. + Clarified comment. + (POP_FAILURE_POINT): Defined to use in places where put number + of registers on the stack into a variable before using it to + decrement the stack, so as to not confuse the compiler. + (IS_IN_FIRST_STRING): Defined to check if a pointer points into + the first string. + (SET_REGS_MATCHED): Changed to use the struct register_info + bits; also set the matched-something bit to false if the + register isn't currently active. (This is a redundant setting.) + (re_match_2): Cleaned up and completed the header comment. + Updated the failure stack comment. + Replaced the ``2'' with MAX_NUM_FAILURE_ITEMS in the static + allocation of initial_stack, because now more than two (now up + to MAX_FAILURE_ITEMS) items get pushed on the failure stack each + time. + Ditto for stackb. 
+ Trashed restart_seg1, regend_seg1, best_regstart_seg1, and + best_regend_seg1 because they could have erroneous information + in them, such as when matching ``a'' (in string1) and ``ab'' (in + string2) with ``(a)*ab''; before using IS_IN_FIRST_STRING to see + whether or not the register starts or ends in string1, + regstart[1] pointed past the end of string1, yet regstart_seg1 + was 0! + Added variable reg_info of type struct register_info to keep + track of currently active registers and whether or not they + currently match anything. + Commented best_regs_set. + Trashed reg_active and reg_matched_something and put the + information they held into reg_info; saves space on the stack. + Replaced NULL with '\000'. + In begline case, compacted the code. + Used assert to exit if had an internal error. + In begbuf case, because now force the string we're working on + into string2 if there aren't two strings, now allow d == string2 + if there is no string1 (and the check for that is size1 == 0!); + also now succeeds if there aren't any strings at all. + (main, ifdef canned): Put test type into a variable so could + change it while debugging. + +Sat Mar 24 12:24:13 1990 Kathy Hargreaves (kathy at hayley) + + * regex.c (GET_UNSIGNED_NUMBER): Deleted references to num_fetches. + (re_compile_pattern): Deleted num_fetches because could keep + track of the number of fetches done by saving a pointer into the + pattern. + Added variable beg_interval to be used as a pointer, as above. + Assert that beg_interval points to something when it's used as above. + Initialize succeed_n's to lower_bound because re_compile_fastmap + needs to know it. + (re_compile_fastmap): Deleted unnecessary variable is_a_jump_n. + Added comment. + (re_match_2): Put number of registers on the stack into a + variable before using it to decrement the stack, so as to not + confuse the compiler. + Updated comments. + Used error routine instead of printf and exit. 
+ In exactn case, restored longer code from ``original'' regex.c + which doesn't test translate inside a loop. + + * regex.h: Moved #define NULL and the enum regexpcode definition + and to regex.c. Changed some comments. + + regex.c (global): Updated comments about compiling and for the + re_compile_pattern jump routines. + Added #define NULL and the enum regexpcode definition (from + regex.h). + (enum regexpcode): Added set_number_at to reset the n's of + succeed_n's and jump_n's. + (re_set_syntax): Updated its comment. + (re_compile_pattern): Moved its heading comment to after its macros. + Moved its include statement to the top of the file. + Commented or added to comments of its macros. + In start_memory case: Push laststart value before adding + start_memory and its register number to the buffer, as they + might not get added. + Added code to put a set_number_at before each succeed_n and one + after each jump_n; rewrote code in what seemed a more + straightforward manner to put all these things in the pattern so + the succeed_n's would correctly jump to the set_number_at's of + the matching jump_n's, and so the jump_n's would correctly jump + to after the set_number_at's of the matching succeed_n's. + Initialize succeed_n n's to -1. + (insert_op_2): Added this to insert an operation followed by + two integers. + (re_compile_fastmap): Added set_number_at case. + (re_match_2): Moved heading comment to after macros. + Added mention of REGS to heading comment. + No longer turn a succeed_n with n = 0 into an on_failure_jump, + because n needs to be reset each time through a loop. + Check to see if a succeed_n's n is set by its set_number_at. + Added set_number_at case. + Updated some comments. + (main): Added another main to run posix tests, which is compiled + ifdef both test and canned. (Old main is still compiled ifdef + test only). 
+ +Tue Mar 19 09:22:55 1990 Kathy Hargreaves (kathy at hayley) + + * regex.[hc]: Change all instances of the word ``legal'' to + ``valid'' and all instances of ``illegal'' to ``invalid.'' + +Sun Mar 4 12:11:31 1990 Kathy Hargreaves (kathy at hayley) + + * regex.h: Added syntax bit RE_NO_EMPTY_RANGES which is set if + an ending range point has to collate higher or equal to the + starting range point. + Added syntax bit RE_NO_HYPHEN_RANGE_END which is set if a hyphen + can't be an ending range point. + Set the two above bits in RE_SYNTAX_POSIX_BASIC and + RE_SYNTAX_POSIX_EXTENDED. + + regex.c: (re_compile_pattern): Don't allow empty ranges if the + RE_NO_EMPTY_RANGES syntax bit is set. + Don't let a hyphen be a range end if the RE_NO_HYPHEN_RANGE_END + syntax bit is set. + (ESTACK_PUSH_2): renamed this PUSH_FAILURE_POINT and made it + push all the used registers on the stack, as well as the number + of the highest numbered register used, and (as before) the two + failure points. + (re_match_2): Fixed up comments. + Added arrays best_regstart[], best_regstart_seg1[], best_regend[], + and best_regend_seg1[] to keep track of the best match so far + whenever reach the end of the pattern but not the end of the + string, and there are still failure points on the stack with + which to backtrack; if so, do the saving and force a fail. + If reach the end of the pattern but not the end of the string, + but there are no more failure points to try, restore the best + match so far, set the registers and return. + Compacted some code. + In stop_memory case, if the subexpression we've just left is in + a loop, push onto the stack the loop's on_failure_jump failure + point along with the current pointer into the string (d). + In finalize_jump case, in addition to popping the failure + points, pop the saved registers. + In the fail case, restore the registers, as well as the failure + points. 
+ +Sun Feb 18 15:08:10 1990 Kathy Hargreaves (kathy at hayley) + + * regex.c: (global): Defined a macro GET_BUFFER_SPACE which + makes sure you have a specified number of buffer bytes + allocated. + Redefined the macro BUFPUSH to use this. + Added comments. + + (re_compile_pattern): Call GET_BUFFER_SPACE before storing or + inserting any jumps. + + (re_match_2): Set d to string1 + pos and dend to end_match_1 + only if string1 isn't null. + Force exit from a loop if it's around empty parentheses. + In stop_memory case, if found some jumps, increment p2 before + extracting address to which to jump. Also, don't need to know + how many more times can jump_n. + In begline case, d must equal string1 or string2, in that order, + only if they are not null. + In maybe_finalize_jump case, skip over start_memorys' and + stop_memorys' register numbers, too. + +Thu Feb 15 15:53:55 1990 Kathy Hargreaves (kathy at hayley) + + * regex.c (BUFPUSH): off by one goof in deciding whether to + EXTEND_BUFFER. + +Wed Jan 24 17:07:46 1990 Kathy Hargreaves (kathy at hayley) + + * regex.h: Moved definition of NULL to here. + Got rid of ``In other words...'' comment. + Added to some comments. + + regex.c: (re_compile_pattern): Tried to bulletproof some code, + i.e., checked if backward references (e.g., p[-1]) were within + the range of pattern. + + (re_compile_fastmap): Fixed a bug in succeed_n part where was + getting the amount to jump instead of how many times to jump. + + (re_search_2): Changed the name of the variable ``total'' to + ``total_size.'' + Condensed some code. + + (re_match_2): Moved the comment about duplicate from above the + start_memory case to above duplicate case. + + (global): Rewrote some comments. + Added commandline arguments to testing. + +Wed Jan 17 11:47:27 1990 Kathy Hargreaves (kathy at hayley) + + * regex.c: (global): Defined a macro STORE_NUMBER which stores a + number into two contiguous bytes. 
Also defined STORE_NUMBER_AND_INCR + which does the same thing and then increments the pointer to the + storage place to point after the number. + Defined a macro EXTRACT_NUMBER which extracts a number from two + contiguous bytes. Also defined EXTRACT_NUMBER_AND_INCR which + does the same thing and then increments the pointer to the + source to point to after where the number was. + +Tue Jan 16 12:09:19 1990 Kathy Hargreaves (kathy at hayley) + + * regex.h: Incorporated rms' changes. + Defined RE_NO_BK_REFS syntax bit which is set when want to + interpret back reference patterns as literals. + Defined RE_NO_EMPTY_BRACKETS syntax bit which is set when want + empty bracket expressions to be illegal. + Defined RE_CONTEXTUAL_ILLEGAL_OPS syntax bit which is set when want + it to be illegal for *, +, ? and { to be first in an re or come + immediately after a | or a (, and for ^ not to appear in a + nonleading position and $ in a nontrailing position (outside of + bracket expressions, that is). + Defined RE_LIMITED_OPS syntax bit which is set when want +, ? + and | to always be literals instead of ops. + Fixed up the Posix syntax. + Changed the syntax bit comments from saying, e.g., ``0 means...'' + to ``If this bit is set, it means...''. + Changed the syntax bit defines to use shifts instead of integers. + + * regex.c: (global): Incorporated rms' changes. + + (re_compile_pattern): Incorporated rms' changes + Made it illegal for a $ to appear anywhere but inside a bracket + expression or at the end of an re when RE_CONTEXTUAL_ILLEGAL_OPS + is set. Made the same hold for ^ except it has to be at the + beginning of an re instead of the end. + Made the re "[]" illegal if RE_NO_EMPTY_BRACKETS is set. + Made it illegal for | to be first or last in an re, or immediately + follow another | or a (. + Added and embellished some comments. + Allowed \{ to be interpreted as a literal if RE_NO_BK_CURLY_BRACES + is set. 
+ Made it illegal for *, +, ?, and { to appear first in an re, or + immediately follow a | or a ( when RE_CONTEXTUAL_ILLEGAL_OPS is set. + Made back references interpreted as literals if RE_NO_BK_REFS is set. + Made recursive intervals either illegal (if RE_NO_BK_CURLY_BRACES + isn't set) or interpreted as literals (if is set), if RE_INTERVALS + is set. + Made it treat +, ? and | as literals if RE_LIMITED_OPS is set. + Cleaned up some code. + +Thu Dec 21 15:31:32 1989 Kathy Hargreaves (kathy at hayley) + + * regex.c: (global): Moved RE_DUP_MAX to regex.h and made it + equal 2^15 - 1 instead of 1000. + Defined NULL to be zero. + Moved the definition of BYTEWIDTH to regex.h. + Made the global variable obscure_syntax nonstatic so the tests in + another file could use it. + + (re_compile_pattern): Defined a maximum length (CHAR_CLASS_MAX_LENGTH) + for character class strings (i.e., what's between the [: and the + :]'s). + Defined a macro SET_LIST_BIT(c) which sets the bit for C in a + character set list. + Took out comments that EXTEND_BUFFER clobbers C. + Made the string "^" match itself, if not RE_CONTEXT_IND_OPS. + Added character classes to bracket expressions. + Change the laststart pointer saved with the start of each + subexpression to point to start_memory instead of after the + following register number. This is because the subexpression + might be in a loop. + Added comments and compacted some code. + Made intervals only work if preceded by an re matching a single + character or a subexpression. + Made back references to nonexistent subexpressions illegal if + using POSIX syntax. + Made intervals work on the last preceding character of a + concatenation of characters, e.g., ab{0,} matches abbb, not abab. + Moved macro PREFETCH to outside the routine. + + (re_compile_fastmap): Added succeed_n to work analogously to + on_failure_jump if n is zero and jump_n to work analogously to + the other backward jumps. 
+ + (re_match_2): Defined macro SET_REGS_MATCHED to set which + current subexpressions had matches within them. + Changed some comments. + Added reg_active and reg_matched_something arrays to keep track + of in which subexpressions currently have matched something. + Defined MATCHING_IN_FIRST_STRING and replaced ``dend == end_match_1'' + with it to make code easier to understand. + Fixed so can apply * and intervals to arbitrarily nested + subexpressions. (Lots of previous bugs here.) + Changed so won't match a newline if syntax bit RE_DOT_NOT_NULL is set. + Made the upcase array nonstatic so the testing file could use it also. + + (main.c): Moved the tests out to another file. + + (tests.c): Moved all the testing stuff here. + +Sat Nov 18 19:30:30 1989 Kathy Hargreaves (kathy at hayley) + + * regex.c: (re_compile_pattern): Defined RE_DUP_MAX, the maximum + number of times an interval can match a pattern. + Added macro GET_UNSIGNED_NUMBER (used to get below): + Added variables lower_bound and upper_bound for upper and lower + bounds of intervals. + Added variable num_fetches so intervals could do backtracking. + Added code to handle '{' and "\{" and intervals. + Added to comments. + + (store_jump_n): (Added) Stores a jump with a number following the + relative address (for intervals). + + (insert_jump_n): (Added) Inserts a jump_n. + + (re_match_2): Defined a macro ESTACK_PUSH_2 for the error stack; + it checks for overflow and reallocates if necessary. + + * regex.h: Added bits (RE_INTERVALS and RE_NO_BK_CURLY_BRACES) + to obscure syntax to indicate whether or not + a syntax handles intervals and recognizes either \{ and + \} or { and } as operators. Also added two syntaxes + RE_SYNTAX_POSIX_BASIC and RE_POSIX_EXTENDED and two command codes + to the enumeration regexpcode; they are succeed_n and jump_n. 
+ +Sat Nov 18 19:30:30 1989 Kathy Hargreaves (kathy at hayley) + + * regex.c: (re_compile_pattern): Defined INIT_BUFF_SIZE to get rid + of repeated constants in code. Tested with value 1. + Renamed PATPUSH as BUFPUSH, since it pushes things onto the + buffer, not the pattern. Also made this macro extend the buffer + if it's full (so could do the following): + Took out code at top of loop that checks to see if buffer is going + to be full after 10 additions (and reallocates if necessary). + + (insert_jump): Rearranged declaration lines so comments would read + better. + + (re_match_2): Compacted exactn code and added more comments. + + (main): Defined macros TEST_MATCH and MATCH_SELF to do + testing; took out loop so could use these instead. + +Tue Oct 24 20:57:18 1989 Kathy Hargreaves (kathy at hayley) + + * regex.c (re_set_syntax): Gave argument `syntax' a type. + (store_jump, insert_jump): made them void functions. + +Local Variables: +mode: indented-text +left-margin: 8 +version-control: never +End: diff --git a/regex-0.12/INSTALL b/regex-0.12/INSTALL @@ -0,0 +1,117 @@ +This is a generic INSTALL file for utilities distributions. +If this package does not come with, e.g., installable documentation or +data files, please ignore the references to them below. + +To compile this package: + +1. Configure the package for your system. In the directory that this +file is in, type `./configure'. If you're using `csh' on an old +version of System V, you might need to type `sh configure' instead to +prevent `csh' from trying to execute `configure' itself. + +The `configure' shell script attempts to guess correct values for +various system-dependent variables used during compilation, and +creates the Makefile(s) (one in each subdirectory of the source +directory). In some packages it creates a C header file containing +system-dependent definitions. It also creates a file `config.status' +that you can run in the future to recreate the current configuration. 
+ +Running `configure' takes a minute or two. While it is running, it +prints some messages that tell what it is doing. If you don't want to +see the messages, run `configure' with its standard output redirected +to `/dev/null'; for example, `./configure >/dev/null'. + +To compile the package in a different directory from the one +containing the source code, you must use a version of `make' that +supports the VPATH variable, such as GNU `make'. `cd' to the directory +where you want the object files and executables to go and run +`configure'. `configure' automatically checks for the source code in +the directory that `configure' is in and in `..'. If for some reason +`configure' is not in the source code directory that you are +configuring, then it will report that it can't find the source code. +In that case, run `configure' with the option `--srcdir=DIR', where +DIR is the directory that contains the source code. + +By default, `make install' will install the package's files in +/usr/local/bin, /usr/local/lib, /usr/local/man, etc. You can specify +an installation prefix other than /usr/local by giving `configure' the +option `--prefix=PATH'. Alternately, you can do so by giving a value +for the `prefix' variable when you run `make', e.g., + make prefix=/usr/gnu + +You can specify separate installation prefixes for +architecture-specific files and architecture-independent files. If +you give `configure' the option `--exec-prefix=PATH' or set the +`make' variable `exec_prefix' to PATH, the package will use PATH as +the prefix for installing programs and libraries. Data files and +documentation will still use the regular prefix. Normally, all files +are installed using the regular prefix. + +Another `configure' option is useful mainly in `Makefile' rules for +updating `config.status' and `Makefile'. 
The `--no-create' option +figures out the configuration for your system and records it in +`config.status', without actually configuring the package (creating +`Makefile's and perhaps a configuration header file). Later, you can +run `./config.status' to actually configure the package. You can also +give `config.status' the `--recheck' option, which makes it re-run +`configure' with the same arguments you used before. This option is +useful if you change `configure'. + +Some packages pay attention to `--with-PACKAGE' options to `configure', +where PACKAGE is something like `gnu-libc' or `x' (for the X Window System). +The README should mention any --with- options that the package recognizes. + +`configure' ignores any other arguments that you give it. + +If your system requires unusual options for compilation or linking +that `configure' doesn't know about, you can give `configure' initial +values for some variables by setting them in the environment. In +Bourne-compatible shells, you can do that on the command line like +this: + CC='gcc -traditional' DEFS=-D_POSIX_SOURCE ./configure + +The `make' variables that you might want to override with environment +variables when running `configure' are: + +(For these variables, any value given in the environment overrides the +value that `configure' would choose:) +CC C compiler program. + Default is `cc', or `gcc' if `gcc' is in your PATH. +INSTALL Program to use to install files. + Default is `install' if you have it, `cp' otherwise. + +(For these variables, any value given in the environment is added to +the value that `configure' chooses:) +DEFS Configuration options, in the form `-Dfoo -Dbar ...' + Do not use this variable in packages that create a + configuration header file. +LIBS Libraries to link with, in the form `-lfoo -lbar ...' 
+ +If you need to do unusual things to compile the package, we encourage +you to figure out how `configure' could check whether to do them, and +mail diffs or instructions to the address given in the README so we +can include them in the next release. + +2. Type `make' to compile the package. If you want, you can override +the `make' variables CFLAGS and LDFLAGS like this: + + make CFLAGS=-O2 LDFLAGS=-s + +3. If the package comes with self-tests and you want to run them, +type `make check'. If you're not sure whether there are any, try it; +if `make' responds with something like + make: *** No way to make target `check'. Stop. +then the package does not come with self-tests. + +4. Type `make install' to install programs, data files, and +documentation. + +5. You can remove the program binaries and object files from the +source directory by typing `make clean'. To also remove the +Makefile(s), the header file containing system-dependent definitions +(if the package uses one), and `config.status' (all the files that +`configure' created), type `make distclean'. + +The file `configure.in' is used as a template to create `configure' by +a program called `autoconf'. You will only need it if you want to +regenerate `configure' using a newer version of `autoconf'. diff --git a/regex-0.12/Makefile.in b/regex-0.12/Makefile.in @@ -0,0 +1,98 @@ +# Makefile for regex. +# +# Copyright (C) 1992, 1993 Free Software Foundation, Inc. +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2, or (at your option) +# any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. 
+# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + +version = 0.12 + +# You can define CPPFLAGS on the command line. Aside from system-specific +# flags, you can define: +# -DREGEX_MALLOC to use malloc/realloc/free instead of alloca. +# -DDEBUG to enable the compiled pattern disassembler and execution +# tracing; code runs substantially slower. +# -DEXTRACT_MACROS to use the macros EXTRACT_* (as opposed to +# the corresponding C procedures). If not -DDEBUG, the macros +# are used. +CPPFLAGS = + +# Likewise, you can override CFLAGS to optimize, use -Wall, etc. +CFLAGS = -g + +# Ditto for LDFLAGS and LOADLIBES. +LDFLAGS = +LOADLIBES = + +srcdir = @srcdir@ +VPATH = @srcdir@ + +CC = @CC@ +DEFS = @DEFS@ + +SHELL = /bin/sh + +subdirs = doc test + +default all:: regex.o +.PHONY: default all + +regex.o: regex.c regex.h + $(CC) $(CFLAGS) $(CPPFLAGS) $(DEFS) -I. -I$(srcdir) -c $< + +clean mostlyclean:: + rm -f *.o + +distclean:: clean + rm -f Makefile config.status + +extraclean:: distclean + rm -f patch* *~* *\#* *.orig *.rej *.bak core a.out + +configure: configure.in + autoconf + +config.status: configure + sh configure --no-create + +Makefile: Makefile.in config.status + sh config.status + +makeargs = $(MFLAGS) CPPFLAGS='$(CPPFLAGS)' CFLAGS='$(CFLAGS)' CC='$(CC)' \ +DEFS='$(DEFS)' LDFLAGS='$(LDFLAGS)' LOADLIBES='$(LOADLIBES)' + +default all install \ +mostlyclean clean distclean extraclean realclean \ +TAGS check:: + for d in $(subdirs); do (cd $$d; $(MAKE) $(makeargs) $@); done +.PHONY: install mostlyclean clean distclean extraclean realclean TAGS check + +# Prevent GNU make 3 from overflowing arg limit on system V. 
+.NOEXPORT: + +distfiles = AUTHORS ChangeLog COPYING INSTALL NEWS README \ + *.in configure regex.c regex.h +distdir = regex-$(version) +distargs = version=$(version) distdir=../$(distdir)/$$d +dist: TAGS configure + @echo "Version numbers in: Makefile.in, ChangeLog, NEWS," + @echo " regex.c, regex.h," + @echo " and doc/xregex.texi (if modified)." + rm -rf $(distdir) + mkdir $(distdir) + ln $(distfiles) $(distdir) + for d in $(subdirs); do (cd $$d; $(MAKE) $(distargs) dist); done + tar czhf $(distdir).tar.Z $(distdir) + rm -rf $(distdir) +.PHONY: dist diff --git a/regex-0.12/NEWS b/regex-0.12/NEWS @@ -0,0 +1,62 @@ +Version 0.12 + +* regex.c does not #define bcmp/bcopy/bzero if they already are. + +* regex.h does not redefine `const' if it is already defined, even if + __STDC__ is not defined. + +* RE_SYNTAX_ED added (same as POSIX BRE's). + +* The following bugs have been fixed, among others: + * The pattern \w+ doesn't infinite loop. + * The pattern ".+\n" is compiled correctly. + * Expressions with more than MAX_REGNUM groups are compiled correctly. + +* Patterns that end in a repetition operator (e.g., `*') match + slightly faster if no looping is actually necessary. + +Version 0.11 (17 Sep 92) + +* Back-references to nonexistent subexpressions, as in the r.e. `abc\1', + are always invalid. Previously, they could match the literal digit, + e.g., the stated r.e. might have matched `abc1'. + +* Empty subexpressions are always valid (POSIX leaves this undefined). + +* Simplified rules for ^ and $ being anchors. + +* One minor speedup (rewriting the C procedure `pop_failure_point' as a + macro again). + +* Bug fixes involving: + - Declarations in regex.h and non-ANSI compilers. + - Bracket expressions with characters between 0x80-0xff. + - Memory leak in re_match_2 on systems requiring `alloca (0)' to + free alloca'd storage. + +* Test and documentation files moved into subdirectories. 
+ +Version 0.10 (9 Sep 92) + +* `obscure_syntax' is now called `re_default_syntax'. + +* `re_comp's return type is no longer `const', for compatibility with BSD. + +* POSIX syntaxes now include as much functionality as possible + (consistent with the standard). + +* Compilation conditionals normalized to what the rest of GNU is + migrating towards these days. + +* Bug fixes involving: + - Ranges with characters between 0x80 and 0xff, e.g., [\001-\377]. + - `re_compile_fastmap' and the sequence `.*\n'. + - Intervals with exact counts, e.g., a{5}. + +* Changed distribution to use a standard Makefile, install the info + files, use a configure script, etc. + +Version 0.9 + +* The longest match was not always chosen: `a*|ab' didn't match `aab'. + diff --git a/regex-0.12/README b/regex-0.12/README @@ -0,0 +1,60 @@ +This directory contains the GNU regex library. It is compliant with +POSIX.2, except for internationalization features. + +See the file NEWS for a list of major changes in the current release. + +See the file INSTALL for compilation instructions. (The only thing +installed is the documentation; regex.c is compiled into regex.o, but +not installed anywhere.) + +The subdirectory `doc' contains a (programmers') manual for the library. +It's probably out-of-date. Improvements are welcome. + +The subdirectory `test' contains the various tests we've written. + +We know this code is not as fast as it might be. If you have specific +suggestions, profiling results, or other such useful information to +report, please do. + +Emacs 18 is not going to use this revised regex (but Emacs 19 will). If +you want to try it with Emacs 18, apply the patch at the end of this +file first. + +Mail bug reports to bug-gnu-utils@prep.ai.mit.edu. + +Please include an actual regular expression that fails (and the syntax +used to compile it); without that, there's no way to reproduce the bug, +so there's no way we can fix it. 
Even if you include a patch, also +include the regular expression in error; otherwise, we can't know for +sure what you're trying to fix. + +Here is the patch to make this version of regex work with Emacs 18. + +*** ORIG/search.c Tue Jan 8 13:04:55 1991 +--- search.c Sun Jan 5 10:57:00 1992 +*************** +*** 25,26 **** +--- 25,28 ---- + #include "commands.h" ++ ++ #include <sys/types.h> + #include "regex.h" +*************** +*** 477,479 **** + /* really needed. */ +! && *(searchbuf.buffer) == (char) exactn /* first item is "exact match" */ + && searchbuf.buffer[1] + 2 == searchbuf.used) /*first is ONLY item */ +--- 479,482 ---- + /* really needed. */ +! /* first item is "exact match" */ +! && *(searchbuf.buffer) == (char) RE_EXACTN_VALUE + && searchbuf.buffer[1] + 2 == searchbuf.used) /*first is ONLY item */ +*************** +*** 1273,1275 **** + searchbuf.allocated = 100; +! searchbuf.buffer = (char *) malloc (searchbuf.allocated); + searchbuf.fastmap = search_fastmap; +--- 1276,1278 ---- + searchbuf.allocated = 100; +! searchbuf.buffer = (unsigned char *) malloc (searchbuf.allocated); + searchbuf.fastmap = search_fastmap; diff --git a/regex-0.12/configure b/regex-0.12/configure @@ -0,0 +1,462 @@ +#!/bin/sh +# Guess values for system-dependent variables and create Makefiles. +# Generated automatically using autoconf. +# Copyright (C) 1991, 1992, 1993 Free Software Foundation, Inc. + +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2, or (at your option) +# any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. 
+ +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + +# Usage: configure [--srcdir=DIR] [--host=HOST] [--gas] [--nfp] [--no-create] +# [--prefix=PREFIX] [--exec-prefix=PREFIX] [--with-PACKAGE] [TARGET] +# Ignores all args except --srcdir, --prefix, --exec-prefix, --no-create, and +# --with-PACKAGE unless this script has special code to handle it. + + +for arg +do + # Handle --exec-prefix with a space before the argument. + if test x$next_exec_prefix = xyes; then exec_prefix=$arg; next_exec_prefix= + # Handle --host with a space before the argument. + elif test x$next_host = xyes; then next_host= + # Handle --prefix with a space before the argument. + elif test x$next_prefix = xyes; then prefix=$arg; next_prefix= + # Handle --srcdir with a space before the argument. + elif test x$next_srcdir = xyes; then srcdir=$arg; next_srcdir= + else + case $arg in + # For backward compatibility, also recognize exact --exec_prefix. 
+ -exec-prefix=* | --exec_prefix=* | --exec-prefix=* | --exec-prefi=* | --exec-pref=* | --exec-pre=* | --exec-pr=* | --exec-p=* | --exec-=* | --exec=* | --exe=* | --ex=* | --e=*) + exec_prefix=`echo $arg | sed 's/[-a-z_]*=//'` ;; + -exec-prefix | --exec_prefix | --exec-prefix | --exec-prefi | --exec-pref | --exec-pre | --exec-pr | --exec-p | --exec- | --exec | --exe | --ex | --e) + next_exec_prefix=yes ;; + + -gas | --gas | --ga | --g) ;; + + -host=* | --host=* | --hos=* | --ho=* | --h=*) ;; + -host | --host | --hos | --ho | --h) + next_host=yes ;; + + -nfp | --nfp | --nf) ;; + + -no-create | --no-create | --no-creat | --no-crea | --no-cre | --no-cr | --no-c | --no- | --no) + no_create=1 ;; + + -prefix=* | --prefix=* | --prefi=* | --pref=* | --pre=* | --pr=* | --p=*) + prefix=`echo $arg | sed 's/[-a-z_]*=//'` ;; + -prefix | --prefix | --prefi | --pref | --pre | --pr | --p) + next_prefix=yes ;; + + -srcdir=* | --srcdir=* | --srcdi=* | --srcd=* | --src=* | --sr=* | --s=*) + srcdir=`echo $arg | sed 's/[-a-z_]*=//'` ;; + -srcdir | --srcdir | --srcdi | --srcd | --src | --sr | --s) + next_srcdir=yes ;; + + -with-* | --with-*) + package=`echo $arg|sed 's/-*with-//'` + # Delete all the valid chars; see if any are left. + if test -n "`echo $package|sed 's/[-a-zA-Z0-9_]*//g'`"; then + echo "configure: $package: invalid package name" >&2; exit 1 + fi + eval "with_`echo $package|sed s/-/_/g`=1" ;; + + *) ;; + esac + fi +done + +trap 'rm -f conftest* core; exit 1' 1 3 15 + +rm -f conftest* +compile='${CC-cc} $CFLAGS $DEFS conftest.c -o conftest $LIBS >/dev/null 2>&1' + +# A filename unique to this package, relative to the directory that +# configure is in, which we can look for to find out if srcdir is correct. +unique_file=regex.c + +# Find the source files, if location was not specified. +if test -z "$srcdir"; then + srcdirdefaulted=yes + # Try the directory containing this script, then `..'. 
+ prog=$0 + confdir=`echo $prog|sed 's%/[^/][^/]*$%%'` + test "X$confdir" = "X$prog" && confdir=. + srcdir=$confdir + if test ! -r $srcdir/$unique_file; then + srcdir=.. + fi +fi +if test ! -r $srcdir/$unique_file; then + if test x$srcdirdefaulted = xyes; then + echo "configure: Can not find sources in \`${confdir}' or \`..'." 1>&2 + else + echo "configure: Can not find sources in \`${srcdir}'." 1>&2 + fi + exit 1 +fi +# Preserve a srcdir of `.' to avoid automounter screwups with pwd. +# But we can't avoid them for `..', to make subdirectories work. +case $srcdir in + .|/*|~*) ;; + *) srcdir=`cd $srcdir; pwd` ;; # Make relative path absolute. +esac + + +if test -z "$CC"; then + echo checking for gcc + saveifs="$IFS"; IFS="${IFS}:" + for dir in $PATH; do + test -z "$dir" && dir=. + if test -f $dir/gcc; then + CC="gcc" + break + fi + done + IFS="$saveifs" +fi +test -z "$CC" && CC="cc" + +# Find out if we are using GNU C, under whatever name. +cat > conftest.c <<EOF +#ifdef __GNUC__ + yes +#endif +EOF +${CC-cc} -E conftest.c > conftest.out 2>&1 +if egrep yes conftest.out >/dev/null 2>&1; then + GCC=1 # For later tests. +fi +rm -f conftest* + +# Make sure to not get the incompatible SysV /etc/install and +# /usr/sbin/install, which might be in PATH before a BSD-like install, +# or the SunOS /usr/etc/install directory, or the AIX /bin/install, +# or the AFS install, which mishandles nonexistent args. (Sigh.) +if test -z "$INSTALL"; then + echo checking for install + saveifs="$IFS"; IFS="${IFS}:" + for dir in $PATH; do + test -z "$dir" && dir=. 
+ case $dir in + /etc|/usr/sbin|/usr/etc|/usr/afsws/bin) ;; + *) + if test -f $dir/install; then + if grep dspmsg $dir/install >/dev/null 2>&1; then + : # AIX + else + INSTALL="$dir/install -c" + INSTALL_PROGRAM='$(INSTALL)' + INSTALL_DATA='$(INSTALL) -m 644' + break + fi + fi + ;; + esac + done + IFS="$saveifs" +fi +INSTALL=${INSTALL-cp} +INSTALL_PROGRAM=${INSTALL_PROGRAM-'$(INSTALL)'} +INSTALL_DATA=${INSTALL_DATA-'$(INSTALL)'} + + +echo checking for AIX +echo checking how to run the C preprocessor +if test -z "$CPP"; then + CPP='${CC-cc} -E' + cat > conftest.c <<EOF +#include <stdio.h> +EOF +err=`eval "$CPP $DEFS conftest.c 2>&1 >/dev/null"` +if test -z "$err"; then + : +else + CPP=/lib/cpp +fi +rm -f conftest* +fi + +cat > conftest.c <<EOF +#ifdef _AIX + yes +#endif + +EOF +eval "$CPP $DEFS conftest.c > conftest.out 2>&1" +if egrep "yes" conftest.out >/dev/null 2>&1; then + DEFS="$DEFS -D_ALL_SOURCE=1" +fi +rm -f conftest* + + +echo checking for DYNIX/ptx libseq +cat > conftest.c <<EOF +#if defined(_SEQUENT_) + yes +#endif + +EOF +eval "$CPP $DEFS conftest.c > conftest.out 2>&1" +if egrep "yes" conftest.out >/dev/null 2>&1; then + SEQUENT=1 +fi +rm -f conftest* + +test -n "$SEQUENT" && test -f /usr/lib/libseq.a && + LIBS="$LIBS -lseq" + +echo checking for POSIXized ISC +if test -d /etc/conf/kconfig.d && + grep _POSIX_VERSION /usr/include/sys/unistd.h >/dev/null 2>&1 +then + ISC=1 # If later tests want to check for ISC. + DEFS="$DEFS -D_POSIX_SOURCE=1" + if test -n "$GCC"; then + CC="$CC -posix" + else + CC="$CC -Xp" + fi +fi + +echo checking for minix/config.h +cat > conftest.c <<EOF +#include <minix/config.h> +EOF +err=`eval "$CPP $DEFS conftest.c 2>&1 >/dev/null"` +if test -z "$err"; then + MINIX=1 +fi +rm -f conftest* + +# The Minix shell can't assign to the same variable on the same line! 
+if test -n "$MINIX"; then + DEFS="$DEFS -D_POSIX_SOURCE=1" + DEFS="$DEFS -D_POSIX_1_SOURCE=2" + DEFS="$DEFS -D_MINIX=1" +fi + + +echo checking for ANSI C header files +cat > conftest.c <<EOF +#include <stdlib.h> +#include <stdarg.h> +#include <string.h> +#include <float.h> +EOF +err=`eval "$CPP $DEFS conftest.c 2>&1 >/dev/null"` +if test -z "$err"; then + # SunOS string.h does not declare mem*, contrary to ANSI. +echo '#include <string.h>' > conftest.c +eval "$CPP $DEFS conftest.c > conftest.out 2>&1" +if egrep "memchr" conftest.out >/dev/null 2>&1; then + # SGI's /bin/cc from Irix-4.0.5 gets non-ANSI ctype macros unless using -ansi. +cat > conftest.c <<EOF +#include <ctype.h> +#define ISLOWER(c) ('a' <= (c) && (c) <= 'z') +#define TOUPPER(c) (ISLOWER(c) ? 'A' + ((c) - 'a') : (c)) +#define XOR(e,f) (((e) && !(f)) || (!(e) && (f))) +int main () { int i; for (i = 0; i < 256; i++) +if (XOR (islower (i), ISLOWER (i)) || toupper (i) != TOUPPER (i)) exit(2); +exit (0); } + +EOF +eval $compile +if test -s conftest && (./conftest; exit) 2>/dev/null; then + DEFS="$DEFS -DSTDC_HEADERS=1" +fi +rm -f conftest* +fi +rm -f conftest* + +fi +rm -f conftest* + +for hdr in string.h +do +trhdr=HAVE_`echo $hdr | tr '[a-z]./' '[A-Z]__'` +echo checking for ${hdr} +cat > conftest.c <<EOF +#include <${hdr}> +EOF +err=`eval "$CPP $DEFS conftest.c 2>&1 >/dev/null"` +if test -z "$err"; then + DEFS="$DEFS -D${trhdr}=1" +fi +rm -f conftest* +done + + +# The Ultrix 4.2 mips builtin alloca declared by alloca.h only works +# for constant arguments. Useless! 
+echo checking for working alloca.h +cat > conftest.c <<EOF +#include <alloca.h> +main() { exit(0); } +t() { char *p = alloca(2 * sizeof(int)); } +EOF +if eval $compile; then + DEFS="$DEFS -DHAVE_ALLOCA_H=1" +fi +rm -f conftest* + +decl="#ifdef __GNUC__ +#define alloca __builtin_alloca +#else +#if HAVE_ALLOCA_H +#include <alloca.h> +#else +#ifdef _AIX + #pragma alloca +#else +char *alloca (); +#endif +#endif +#endif +" +echo checking for alloca +cat > conftest.c <<EOF +$decl +main() { exit(0); } +t() { char *p = (char *) alloca(1); } +EOF +if eval $compile; then + : +else + alloca_missing=1 +fi +rm -f conftest* + +if test -n "$alloca_missing"; then + # The SVR3 libPW and SVR4 libucb both contain incompatible functions + # that cause trouble. Some versions do not even contain alloca or + # contain a buggy version. If you still want to use their alloca, + # use ar to extract alloca.o from them instead of compiling alloca.c. + ALLOCA=alloca.o +fi + +prog='/* Ultrix mips cc rejects this. */ +typedef int charset[2]; const charset x; +/* SunOS 4.1.1 cc rejects this. */ +char const *const *p; +char **p2; +/* HPUX 7.0 cc rejects these. */ +++p; +p2 = (char const* const*) p;' +echo checking for working const +cat > conftest.c <<EOF + +main() { exit(0); } +t() { $prog } +EOF +if eval $compile; then + : +else + DEFS="$DEFS -Dconst=" +fi +rm -f conftest* + + +if test -z "$prefix" +then + echo checking for gcc to derive installation directory prefix + saveifs="$IFS"; IFS="$IFS:" + for dir in $PATH; do + test -z "$dir" && dir=. + if test $dir != . && test -f $dir/gcc; then + # Not all systems have dirname. 
+ prefix=`echo $dir|sed 's%/[^/][^/]*$%%'` + break + fi + done + IFS="$saveifs" +fi + + +if test -n "$prefix"; then + test -z "$exec_prefix" && exec_prefix='${prefix}' + prsub="s%^prefix\\([ ]*\\)=\\([ ]*\\).*$%prefix\\1=\\2$prefix%" +fi +if test -n "$exec_prefix"; then + prsub="$prsub +s%^exec_prefix\\([ ]*\\)=\\([ ]*\\).*$%\ +exec_prefix\\1=\\2$exec_prefix%" +fi + +trap 'rm -f config.status; exit 1' 1 3 15 +echo creating config.status +rm -f config.status +cat > config.status <<EOF +#!/bin/sh +# Generated automatically by configure. +# Run this file to recreate the current configuration. +# This directory was configured as follows, +# on host `(hostname || uname -n) 2>/dev/null`: +# +# $0 $* + +for arg +do + case "\$arg" in + -recheck | --recheck | --rechec | --reche | --rech | --rec | --re | --r) + exec /bin/sh $0 $* ;; + *) echo "Usage: config.status --recheck" 2>&1; exit 1 ;; + esac +done + +trap 'rm -f Makefile doc/Makefile test/Makefile; exit 1' 1 3 15 +CC='$CC' +INSTALL='$INSTALL' +INSTALL_PROGRAM='$INSTALL_PROGRAM' +INSTALL_DATA='$INSTALL_DATA' +CPP='$CPP' +ALLOCA='$ALLOCA' +LIBS='$LIBS' +srcdir='$srcdir' +DEFS='$DEFS' +prefix='$prefix' +exec_prefix='$exec_prefix' +prsub='$prsub' +EOF +cat >> config.status <<\EOF + +top_srcdir=$srcdir +for file in .. Makefile doc/Makefile test/Makefile; do if [ "x$file" != "x.." ]; then + srcdir=$top_srcdir + # Remove last slash and all that follows it. Not all systems have dirname. + dir=`echo $file|sed 's%/[^/][^/]*$%%'` + if test "$dir" != "$file"; then + test "$top_srcdir" != . && srcdir=$top_srcdir/$dir + test ! -d $dir && mkdir $dir + fi + echo creating $file + rm -f $file + echo "# Generated automatically from `echo $file|sed 's|.*/||'`.in by configure." 
> $file + sed -e " +$prsub +s%@CC@%$CC%g +s%@INSTALL@%$INSTALL%g +s%@INSTALL_PROGRAM@%$INSTALL_PROGRAM%g +s%@INSTALL_DATA@%$INSTALL_DATA%g +s%@CPP@%$CPP%g +s%@ALLOCA@%$ALLOCA%g +s%@LIBS@%$LIBS%g +s%@srcdir@%$srcdir%g +s%@DEFS@%$DEFS% +" $top_srcdir/${file}.in >> $file +fi; done + +exit 0 +EOF +chmod +x config.status +test -n "$no_create" || ./config.status + diff --git a/regex-0.12/configure.in b/regex-0.12/configure.in @@ -0,0 +1,23 @@ +dnl Process this file with autoconf to produce a configure script. +AC_INIT(regex.c) + +AC_PROG_CC +AC_PROG_INSTALL + +dnl I'm not sure if AC_AIX and AC_DYNIX_SEQ are really necessary. The +dnl Autoconf documentation isn't specific about which BSD functions they +dnl provide. +AC_AIX +AC_DYNIX_SEQ +AC_ISC_POSIX +AC_MINIX + +AC_STDC_HEADERS +AC_HAVE_HEADERS(string.h) + +AC_ALLOCA +AC_CONST + +AC_PREFIX(gcc) + +AC_OUTPUT(Makefile doc/Makefile test/Makefile) diff --git a/regex-0.12/doc/Makefile.in b/regex-0.12/doc/Makefile.in @@ -0,0 +1,92 @@ +# Makefile for regex documentation. +# +# Copyright (C) 1992 Free Software Foundation, Inc. +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2, or (at your option) +# any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + +# Installation directories. 
+prefix = /usr/local +infodir = $(prefix)/info + +srcdir = @srcdir@ +VPATH = @srcdir@:../@srcdir@ + +INSTALL = @INSTALL@ +INSTALL_DATA = @INSTALL_DATA@ + +MAKEINFO = makeinfo --no-split +SHELL = /bin/sh +TEX = tex +TEXINDEX = texindex + +default all: regex.info regex.dvi +.PHONY: default all + +# We need to include some code from regex.h. +regex.texi: xregex.texi + rm -f $@ + gawk -f include.awk -vsource=../$(srcdir)/regex.h \ + <../$(srcdir)/doc/xregex.texi \ + | expand >$@ + chmod a-w $@ + +regex.dvi: regex.cps + $(TEX) regex.texi +regex.cps: regex.cp + $(TEXINDEX) regex.?? +regex.cp: regex.texi + $(TEX) ../$(srcdir)/doc/regex.texi + +regex.info: regex.texi + $(MAKEINFO) ../$(srcdir)/doc/regex.texi + +# I know of no way to make a good TAGS file from Texinfo source. +TAGS: + +check: +.PHONY: check + +install: regex.info + -mkdir $(prefix) $(infodir) + for i in *.info*; do $(INSTALL_DATA) $$i $(infodir)/$$i; done +.PHONY: install + +clean mostlyclean: + rm -f regex.?? *.dvi *.log *.toc + +distclean: clean + rm -f Makefile + for f in regex.??s; do if test -z "`cat $$f`"; then rm -f $$f; fi; done + +realclean: distclean + rm -f *.info* regex.??? regex.texi TAGS + +extraclean: distclean + rm -f patch* *~* *\#* *.orig *.rej *.bak core a.out +.PHONY: mostlyclean clean distclean realclean extraclean + +Makefile: Makefile.in ../config.status + (cd ..; sh config.status) + +# Prevent GNU make 3 from overflowing arg limit on system V. +.NOEXPORT: + +# Assumes $(distdir) is the place to put our files. +distfiles = Makefile.in *.texi texinfo.tex include.awk \ + regex.info* regex.aux regex.cps +dist: Makefile regex.info regex.cps + mkdir $(distdir) + ln $(distfiles) $(distdir) +.PHONY: dist diff --git a/regex-0.12/doc/include.awk b/regex-0.12/doc/include.awk @@ -0,0 +1,19 @@ +# Assume `source' is set with -vsource=filename on the command line. +# +/^\[\[\[/ { inclusion = $2; # name of the thing to include. 
+ printing = 0; + while ((getline line < source) > 0) + { + if (match (line, "\\[\\[\\[end " inclusion "\\]\\]\\]")) + printing = 0; + + if (printing) + print line; + + if (match (line,"\\[\\[\\[begin " inclusion "\\]\\]\\]")) + printing = 1; + } + close (source); + next; + } + { print } diff --git a/regex-0.12/doc/regex.aux b/regex-0.12/doc/regex.aux @@ -0,0 +1,136 @@ +'xrdef {Overview-pg}{1} +'xrdef {Overview-snt}{Chapter'tie1} +'xrdef {Regular Expression Syntax-pg}{2} +'xrdef {Regular Expression Syntax-snt}{Chapter'tie2} +'xrdef {Syntax Bits-pg}{2} +'xrdef {Syntax Bits-snt}{Section'tie2.1} +'xrdef {Predefined Syntaxes-pg}{5} +'xrdef {Predefined Syntaxes-snt}{Section'tie2.2} +'xrdef {Collating Elements vs. Characters-pg}{6} +'xrdef {Collating Elements vs. Characters-snt}{Section'tie2.3} +'xrdef {The Backslash Character-pg}{7} +'xrdef {The Backslash Character-snt}{Section'tie2.4} +'xrdef {Common Operators-pg}{9} +'xrdef {Common Operators-snt}{Chapter'tie3} +'xrdef {Match-self Operator-pg}{9} +'xrdef {Match-self Operator-snt}{Section'tie3.1} +'xrdef {Match-any-character Operator-pg}{9} +'xrdef {Match-any-character Operator-snt}{Section'tie3.2} +'xrdef {Concatenation Operator-pg}{10} +'xrdef {Concatenation Operator-snt}{Section'tie3.3} +'xrdef {Repetition Operators-pg}{10} +'xrdef {Repetition Operators-snt}{Section'tie3.4} +'xrdef {Match-zero-or-more Operator-pg}{10} +'xrdef {Match-zero-or-more Operator-snt}{Section'tie3.4.1} +'xrdef {Match-one-or-more Operator-pg}{11} +'xrdef {Match-one-or-more Operator-snt}{Section'tie3.4.2} +'xrdef {Match-zero-or-one Operator-pg}{11} +'xrdef {Match-zero-or-one Operator-snt}{Section'tie3.4.3} +'xrdef {Interval Operators-pg}{12} +'xrdef {Interval Operators-snt}{Section'tie3.4.4} +'xrdef {Alternation Operator-pg}{13} +'xrdef {Alternation Operator-snt}{Section'tie3.5} +'xrdef {List Operators-pg}{13} +'xrdef {List Operators-snt}{Section'tie3.6} +'xrdef {Character Class Operators-pg}{14} +'xrdef {Character Class 
Operators-snt}{Section'tie3.6.1} +'xrdef {Range Operator-pg}{15} +'xrdef {Range Operator-snt}{Section'tie3.6.2} +'xrdef {Grouping Operators-pg}{16} +'xrdef {Grouping Operators-snt}{Section'tie3.7} +'xrdef {Back-reference Operator-pg}{17} +'xrdef {Back-reference Operator-snt}{Section'tie3.8} +'xrdef {Anchoring Operators-pg}{18} +'xrdef {Anchoring Operators-snt}{Section'tie3.9} +'xrdef {Match-beginning-of-line Operator-pg}{18} +'xrdef {Match-beginning-of-line Operator-snt}{Section'tie3.9.1} +'xrdef {Match-end-of-line Operator-pg}{18} +'xrdef {Match-end-of-line Operator-snt}{Section'tie3.9.2} +'xrdef {GNU Operators-pg}{20} +'xrdef {GNU Operators-snt}{Chapter'tie4} +'xrdef {Word Operators-pg}{20} +'xrdef {Word Operators-snt}{Section'tie4.1} +'xrdef {Non-Emacs Syntax Tables-pg}{20} +'xrdef {Non-Emacs Syntax Tables-snt}{Section'tie4.1.1} +'xrdef {Match-word-boundary Operator-pg}{20} +'xrdef {Match-word-boundary Operator-snt}{Section'tie4.1.2} +'xrdef {Match-within-word Operator-pg}{20} +'xrdef {Match-within-word Operator-snt}{Section'tie4.1.3} +'xrdef {Match-beginning-of-word Operator-pg}{21} +'xrdef {Match-beginning-of-word Operator-snt}{Section'tie4.1.4} +'xrdef {Match-end-of-word Operator-pg}{21} +'xrdef {Match-end-of-word Operator-snt}{Section'tie4.1.5} +'xrdef {Match-word-constituent Operator-pg}{21} +'xrdef {Match-word-constituent Operator-snt}{Section'tie4.1.6} +'xrdef {Match-non-word-constituent Operator-pg}{21} +'xrdef {Match-non-word-constituent Operator-snt}{Section'tie4.1.7} +'xrdef {Buffer Operators-pg}{21} +'xrdef {Buffer Operators-snt}{Section'tie4.2} +'xrdef {Match-beginning-of-buffer Operator-pg}{21} +'xrdef {Match-beginning-of-buffer Operator-snt}{Section'tie4.2.1} +'xrdef {Match-end-of-buffer Operator-pg}{21} +'xrdef {Match-end-of-buffer Operator-snt}{Section'tie4.2.2} +'xrdef {GNU Emacs Operators-pg}{22} +'xrdef {GNU Emacs Operators-snt}{Chapter'tie5} +'xrdef {Syntactic Class Operators-pg}{22} +'xrdef {Syntactic Class Operators-snt}{Section'tie5.1} 
+'xrdef {Emacs Syntax Tables-pg}{22} +'xrdef {Emacs Syntax Tables-snt}{Section'tie5.1.1} +'xrdef {Match-syntactic-class Operator-pg}{22} +'xrdef {Match-syntactic-class Operator-snt}{Section'tie5.1.2} +'xrdef {Match-not-syntactic-class Operator-pg}{22} +'xrdef {Match-not-syntactic-class Operator-snt}{Section'tie5.1.3} +'xrdef {What Gets Matched?-pg}{23} +'xrdef {What Gets Matched?-snt}{Chapter'tie6} +'xrdef {Programming with Regex-pg}{24} +'xrdef {Programming with Regex-snt}{Chapter'tie7} +'xrdef {GNU Regex Functions-pg}{24} +'xrdef {GNU Regex Functions-snt}{Section'tie7.1} +'xrdef {GNU Pattern Buffers-pg}{24} +'xrdef {GNU Pattern Buffers-snt}{Section'tie7.1.1} +'xrdef {GNU Regular Expression Compiling-pg}{26} +'xrdef {GNU Regular Expression Compiling-snt}{Section'tie7.1.2} +'xrdef {GNU Matching-pg}{27} +'xrdef {GNU Matching-snt}{Section'tie7.1.3} +'xrdef {GNU Searching-pg}{28} +'xrdef {GNU Searching-snt}{Section'tie7.1.4} +'xrdef {Matching/Searching with Split Data-pg}{29} +'xrdef {Matching/Searching with Split Data-snt}{Section'tie7.1.5} +'xrdef {Searching with Fastmaps-pg}{30} +'xrdef {Searching with Fastmaps-snt}{Section'tie7.1.6} +'xrdef {GNU Translate Tables-pg}{31} +'xrdef {GNU Translate Tables-snt}{Section'tie7.1.7} +'xrdef {Using Registers-pg}{32} +'xrdef {Using Registers-snt}{Section'tie7.1.8} +'xrdef {Freeing GNU Pattern Buffers-pg}{34} +'xrdef {Freeing GNU Pattern Buffers-snt}{Section'tie7.1.9} +'xrdef {POSIX Regex Functions-pg}{35} +'xrdef {POSIX Regex Functions-snt}{Section'tie7.2} +'xrdef {POSIX Pattern Buffers-pg}{35} +'xrdef {POSIX Pattern Buffers-snt}{Section'tie7.2.1} +'xrdef {POSIX Regular Expression Compiling-pg}{35} +'xrdef {POSIX Regular Expression Compiling-snt}{Section'tie7.2.2} +'xrdef {POSIX Matching-pg}{37} +'xrdef {POSIX Matching-snt}{Section'tie7.2.3} +'xrdef {Reporting Errors-pg}{38} +'xrdef {Reporting Errors-snt}{Section'tie7.2.4} +'xrdef {Using Byte Offsets-pg}{39} +'xrdef {Using Byte Offsets-snt}{Section'tie7.2.5} +'xrdef {Freeing 
POSIX Pattern Buffers-pg}{39} +'xrdef {Freeing POSIX Pattern Buffers-snt}{Section'tie7.2.6} +'xrdef {BSD Regex Functions-pg}{40} +'xrdef {BSD Regex Functions-snt}{Section'tie7.3} +'xrdef {BSD Regular Expression Compiling-pg}{40} +'xrdef {BSD Regular Expression Compiling-snt}{Section'tie7.3.1} +'xrdef {BSD Searching-pg}{40} +'xrdef {BSD Searching-snt}{Section'tie7.3.2} +'xrdef {Copying-pg}{42} +'xrdef {Copying-snt}{Appendix'tie'char65{}} +'xrdef {Copying-pg}{42} +'xrdef {Copying-snt}{} +'xrdef {Copying-pg}{43} +'xrdef {Copying-snt}{} +'xrdef {Copying-pg}{48} +'xrdef {Copying-snt}{} +'xrdef {Index-pg}{50} +'xrdef {Index-snt}{} diff --git a/regex-0.12/doc/regex.cps b/regex-0.12/doc/regex.cps @@ -0,0 +1,152 @@ +\initial {$} +\entry {\code {$}}{18} +\initial {(} +\entry {\code {(}}{16} +\initial {)} +\entry {\code {)}}{16} +\initial {*} +\entry {\samp {*}}{10} +\initial {-} +\entry {\samp {-}}{13} +\initial {.} +\entry {\samp {.}}{9} +\initial {:} +\entry {\samp {:]} in regex}{14} +\initial {?} +\entry {\samp {?}}{11} +\initial {[} +\entry {\samp {[}}{13} +\entry {\samp {[:} in regex}{14} +\entry {\samp {[{\tt\hat}}}{13} +\initial {]} +\entry {\samp {]}}{13} +\initial {{\tt\char'173}} +\entry {\samp {{\tt\char'173}}}{12} +\initial {{\tt\char'174}} +\entry {\code {{\tt\char'174}}}{13} +\initial {{\tt\char'175}} +\entry {\samp {{\tt\char'175}}}{12} +\initial {{\tt\char43}} +\entry {\samp {{\tt\char43}}}{11} +\initial {{\tt\hat}} +\entry {\samp {{\tt\hat}}}{13} +\entry {\code {{\tt\hat}}}{18} +\initial {{\tt\indexbackslash }} +\entry {{\tt\indexbackslash }}{7} +\entry {\samp {{\tt\indexbackslash }}}{13} +\entry {\samp {{\tt\indexbackslash }'}}{21} +\entry {\code {{\tt\indexbackslash }(}}{16} +\entry {\code {{\tt\indexbackslash })}}{16} +\entry {\samp {{\tt\indexbackslash }`}}{21} +\entry {\samp {{\tt\indexbackslash }{\tt\char'173}}}{12} +\entry {\code {{\tt\indexbackslash }{\tt\char'174}}}{13} +\entry {\samp {{\tt\indexbackslash }{\tt\char'175}}}{12} +\entry {\samp 
{{\tt\indexbackslash }{\tt\gtr}}}{21} +\entry {\samp {{\tt\indexbackslash }{\tt\less}}}{21} +\entry {\samp {{\tt\indexbackslash }b}}{20} +\entry {\samp {{\tt\indexbackslash }B}}{20} +\entry {\samp {{\tt\indexbackslash }s}}{22} +\entry {\samp {{\tt\indexbackslash }S}}{22} +\entry {\samp {{\tt\indexbackslash }w}}{21} +\entry {\samp {{\tt\indexbackslash }W}}{21} +\initial {A} +\entry {\code {allocated \r {initialization}}}{26} +\entry {alternation operator}{13} +\entry {alternation operator and \samp {{\tt\hat}}}{18} +\entry {anchoring}{18} +\entry {anchors}{18} +\entry {Awk}{5} +\initial {B} +\entry {back references}{17} +\entry {backtracking}{10, 13} +\entry {beginning-of-line operator}{18} +\entry {bracket expression}{13} +\entry {\code {buffer \r {field, set by \code {re{\_}compile{\_}pattern}}}}{27} +\entry {\code {buffer \r {initialization}}}{26} +\initial {C} +\entry {character classes}{14} +\initial {E} +\entry {Egrep}{5} +\entry {Emacs}{5} +\entry {end-of-line operator}{18} +\entry {\code {end\penalty 10000{\spaceskip = 0pt{} }\r {in\penalty 10000{\spaceskip = 0pt{} }\code {struct\penalty 10000{\spaceskip = 0pt{} }re_registers}}}}{32} +\initial {F} +\entry {\code {fastmap \r {initialization}}}{26} +\entry {\code {fastmap{\_}accurate \r {field, set by \code {re{\_}compile{\_}pattern}}}}{27} +\entry {fastmaps}{30} +\initial {G} +\entry {Grep}{5} +\entry {grouping}{16} +\initial {I} +\entry {ignoring case}{35} +\entry {interval expression}{12} +\initial {M} +\entry {matching list}{13} +\entry {matching newline}{13} +\entry {matching with GNU functions}{27} +\initial {N} +\entry {\code {newline{\_}anchor \r {field in pattern buffer}}}{18} +\entry {nonmatching list}{13} +\entry {\code {not{\_}bol \r {field in pattern buffer}}}{18} +\entry {\code {num_regs\penalty 10000{\spaceskip = 0pt{} }\r {in\penalty 10000{\spaceskip = 0pt{} }\code {struct\penalty 10000{\spaceskip = 0pt{} }re_registers}}}}{32} +\initial {O} +\entry {open-group operator and \samp 
{{\tt\hat}}}{18} +\entry {or operator}{13} +\initial {P} +\entry {parenthesizing}{16} +\entry {pattern buffer initialization}{26} +\entry {pattern buffer, definition of}{24} +\entry {POSIX Awk}{5} +\initial {R} +\entry {\code {range \r {argument to \code {re{\_}search}}}}{28} +\entry {\code {re_registers}}{32} +\entry {\code {RE{\_}BACKSLASH{\_}ESCAPE{\_}IN{\_}LIST}}{3} +\entry {\code {RE{\_}BK{\_}PLUS{\_}QM}}{3} +\entry {\code {RE{\_}CHAR{\_}CLASSES}}{3} +\entry {\code {RE{\_}CONTEXT{\_}INDEP{\_}ANCHORS}}{3} +\entry {\code {RE{\_}CONTEXT{\_}INDEP{\_}ANCHORS \r {(and \samp {{\tt\hat}})}}}{18} +\entry {\code {RE{\_}CONTEXT{\_}INDEP{\_}OPS}}{3} +\entry {\code {RE{\_}CONTEXT{\_}INVALID{\_}OPS}}{3} +\entry {\code {RE{\_}DOT{\_}NEWLINE}}{3} +\entry {\code {RE{\_}DOT{\_}NOT{\_}NULL}}{4} +\entry {\code {RE{\_}INTERVALS}}{4} +\entry {\code {RE{\_}LIMITED{\_}OPS}}{4} +\entry {\code {RE{\_}NEWLINE{\_}ALT}}{4} +\entry {\code {RE{\_}NO{\_}BK{\_}BRACES}}{4} +\entry {\code {RE{\_}NO{\_}BK{\_}PARENS}}{4} +\entry {\code {RE{\_}NO{\_}BK{\_}REFS}}{4} +\entry {\code {RE{\_}NO{\_}BK{\_}VBAR}}{4} +\entry {\code {RE{\_}NO{\_}EMPTY{\_}RANGES}}{4} +\entry {\code {re{\_}nsub \r {field, set by \code {re{\_}compile{\_}pattern}}}}{27} +\entry {\code {re{\_}pattern{\_}buffer \r {definition}}}{24} +\entry {\code {re{\_}syntax{\_}options \r {initialization}}}{26} +\entry {\code {RE{\_}UNMATCHED{\_}RIGHT{\_}PAREN{\_}ORD}}{4} +\entry {\code {REG{\_}EXTENDED}}{35} +\entry {\code {REG{\_}ICASE}}{35} +\entry {\code {REG{\_}NEWLINE}}{36} +\entry {\code {REG{\_}NOSUB}}{35} +\entry {\code {regex.c}}{1} +\entry {\code {regex.h}}{1} +\entry {regexp anchoring}{18} +\entry {\code {regmatch{\_}t}}{39} +\entry {\code {regs{\_}allocated}}{32} +\entry {\code {REGS{\_}FIXED}}{33} +\entry {\code {REGS{\_}REALLOCATE}}{32} +\entry {\code {REGS{\_}UNALLOCATED}}{32} +\entry {regular expressions, syntax of}{2} +\initial {S} +\entry {searching with GNU functions}{28} +\entry {\code {start \r {argument to \code 
{re{\_}search}}}}{28} +\entry {\code {start\penalty 10000{\spaceskip = 0pt{} }\r {in\penalty 10000{\spaceskip = 0pt{} }\code {struct\penalty 10000{\spaceskip = 0pt{} }re_registers}}}}{32} +\entry {\code {struct re{\_}pattern{\_}buffer \r {definition}}}{24} +\entry {subexpressions}{16} +\entry {syntax bits}{2} +\entry {\code {syntax \r {field, set by \code {re{\_}compile{\_}pattern}}}}{27} +\entry {syntax initialization}{26} +\entry {syntax of regular expressions}{2} +\initial {T} +\entry {\code {translate \r {initialization}}}{26} +\initial {U} +\entry {\code {used \r {field, set by \code {re{\_}compile{\_}pattern}}}}{27} +\initial {W} +\entry {word boundaries, matching}{20} diff --git a/regex-0.12/doc/regex.info b/regex-0.12/doc/regex.info @@ -0,0 +1,2836 @@ +This is Info file regex.info, produced by Makeinfo-1.52 from the input +file .././doc/regex.texi. + + This file documents the GNU regular expression library. + + Copyright (C) 1992, 1993 Free Software Foundation, Inc. + + Permission is granted to make and distribute verbatim copies of this +manual provided the copyright notice and this permission notice are +preserved on all copies. + + Permission is granted to copy and distribute modified versions of this +manual under the conditions for verbatim copying, provided also that the +section entitled "GNU General Public License" is included exactly as in +the original, and provided that the entire resulting derived work is +distributed under the terms of a permission notice identical to this +one. + + Permission is granted to copy and distribute translations of this +manual into another language, under the above conditions for modified +versions, except that the section entitled "GNU General Public License" +may be included in a translation approved by the Free Software +Foundation instead of in the original English. 
+ + +File: regex.info, Node: Top, Next: Overview, Prev: (dir), Up: (dir) + +Regular Expression Library +************************** + + This manual documents how to program with the GNU regular expression +library. This is edition 0.12a of the manual, 19 September 1992. + + The first part of this master menu lists the major nodes in this Info +document, including the index. The rest of the menu lists all the +lower level nodes in the document. + +* Menu: + +* Overview:: +* Regular Expression Syntax:: +* Common Operators:: +* GNU Operators:: +* GNU Emacs Operators:: +* What Gets Matched?:: +* Programming with Regex:: +* Copying:: Copying and sharing Regex. +* Index:: General index. + -- The Detailed Node Listing -- + +Regular Expression Syntax + +* Syntax Bits:: +* Predefined Syntaxes:: +* Collating Elements vs. Characters:: +* The Backslash Character:: + +Common Operators + +* Match-self Operator:: Ordinary characters. +* Match-any-character Operator:: . +* Concatenation Operator:: Juxtaposition. +* Repetition Operators:: * + ? {} +* Alternation Operator:: | +* List Operators:: [...] [^...] +* Grouping Operators:: (...) +* Back-reference Operator:: \digit +* Anchoring Operators:: ^ $ + +Repetition Operators + +* Match-zero-or-more Operator:: * +* Match-one-or-more Operator:: + +* Match-zero-or-one Operator:: ? +* Interval Operators:: {} + +List Operators (`[' ... `]' and `[^' ... 
`]') + +* Character Class Operators:: [:class:] +* Range Operator:: start-end + +Anchoring Operators + +* Match-beginning-of-line Operator:: ^ +* Match-end-of-line Operator:: $ + +GNU Operators + +* Word Operators:: +* Buffer Operators:: + +Word Operators + +* Non-Emacs Syntax Tables:: +* Match-word-boundary Operator:: \b +* Match-within-word Operator:: \B +* Match-beginning-of-word Operator:: \< +* Match-end-of-word Operator:: \> +* Match-word-constituent Operator:: \w +* Match-non-word-constituent Operator:: \W + +Buffer Operators + +* Match-beginning-of-buffer Operator:: \` +* Match-end-of-buffer Operator:: \' + +GNU Emacs Operators + +* Syntactic Class Operators:: + +Syntactic Class Operators + +* Emacs Syntax Tables:: +* Match-syntactic-class Operator:: \sCLASS +* Match-not-syntactic-class Operator:: \SCLASS + +Programming with Regex + +* GNU Regex Functions:: +* POSIX Regex Functions:: +* BSD Regex Functions:: + +GNU Regex Functions + +* GNU Pattern Buffers:: The re_pattern_buffer type. +* GNU Regular Expression Compiling:: re_compile_pattern () +* GNU Matching:: re_match () +* GNU Searching:: re_search () +* Matching/Searching with Split Data:: re_match_2 (), re_search_2 () +* Searching with Fastmaps:: re_compile_fastmap () +* GNU Translate Tables:: The `translate' field. +* Using Registers:: The re_registers type and related fns. +* Freeing GNU Pattern Buffers:: regfree () + +POSIX Regex Functions + +* POSIX Pattern Buffers:: The regex_t type. +* POSIX Regular Expression Compiling:: regcomp () +* POSIX Matching:: regexec () +* Reporting Errors:: regerror () +* Using Byte Offsets:: The regmatch_t type. 
+* Freeing POSIX Pattern Buffers:: regfree () + +BSD Regex Functions + +* BSD Regular Expression Compiling:: re_comp () +* BSD Searching:: re_exec () + + +File: regex.info, Node: Overview, Next: Regular Expression Syntax, Prev: Top, Up: Top + +Overview +******** + + A "regular expression" (or "regexp", or "pattern") is a text string +that describes some (mathematical) set of strings. A regexp R +"matches" a string S if S is in the set of strings described by R. + + Using the Regex library, you can: + + * see if a string matches a specified pattern as a whole, and + + * search within a string for a substring matching a specified + pattern. + + Some regular expressions match only one string, i.e., the set they +describe has only one member. For example, the regular expression +`foo' matches the string `foo' and no others. Other regular +expressions match more than one string, i.e., the set they describe has +more than one member. For example, the regular expression `f*' matches +the set of strings made up of any number (including zero) of `f's. As +you can see, some characters in regular expressions match themselves +(such as `f') and some don't (such as `*'); the ones that don't match +themselves instead let you specify patterns that describe many +different strings. + + To either match or search for a regular expression with the Regex +library functions, you must first compile it with a Regex pattern +compiling function. A "compiled pattern" is a regular expression +converted to the internal format used by the library functions. Once +you've compiled a pattern, you can use it for matching or searching any +number of times. + + The Regex library consists of two source files: `regex.h' and +`regex.c'. Regex provides three groups of functions with which you can +operate on regular expressions. 
One group--the GNU group--is more +powerful but not completely compatible with the other two, namely the +POSIX and Berkeley UNIX groups; its interface was designed specifically +for GNU. The other groups have the same interfaces as do the regular +expression functions in POSIX and Berkeley UNIX. + + We wrote this chapter with programmers in mind, not users of +programs--such as Emacs--that use Regex. We describe the Regex library +in its entirety, not how to write regular expressions that a particular +program understands. + + +File: regex.info, Node: Regular Expression Syntax, Next: Common Operators, Prev: Overview, Up: Top + +Regular Expression Syntax +************************* + + "Characters" are things you can type. "Operators" are things in a +regular expression that match one or more characters. You compose +regular expressions from operators, which in turn you specify using one +or more characters. + + Most characters represent what we call the match-self operator, i.e., +they match themselves; we call these characters "ordinary". Other +characters represent either all or parts of fancier operators; e.g., +`.' represents what we call the match-any-character operator (which, no +surprise, matches (almost) any character); we call these characters +"special". Two different things determine what characters represent +what operators: + + 1. the regular expression syntax your program has told the Regex + library to recognize, and + + 2. the context of the character in the regular expression. + + In the following sections, we describe these things in more detail. + +* Menu: + +* Syntax Bits:: +* Predefined Syntaxes:: +* Collating Elements vs. Characters:: +* The Backslash Character:: + + +File: regex.info, Node: Syntax Bits, Next: Predefined Syntaxes, Up: Regular Expression Syntax + +Syntax Bits +=========== + + In any particular syntax for regular expressions, some characters are +always special, others are sometimes special, and others are never +special. 
The particular syntax that Regex recognizes for a given +regular expression depends on the value in the `syntax' field of the +pattern buffer of that regular expression. + + You get a pattern buffer by compiling a regular expression. *Note +GNU Pattern Buffers::, and *Note POSIX Pattern Buffers::, for more +information on pattern buffers. *Note GNU Regular Expression +Compiling::, *Note POSIX Regular Expression Compiling::, and *Note BSD +Regular Expression Compiling::, for more information on compiling. + + Regex considers the value of the `syntax' field to be a collection of +bits; we refer to these bits as "syntax bits". In most cases, they +affect what characters represent what operators. We describe the +meanings of the operators to which we refer in *Note Common Operators::, +*Note GNU Operators::, and *Note GNU Emacs Operators::. + + For reference, here is the complete list of syntax bits, in +alphabetical order: + +`RE_BACKSLASH_ESCAPE_IN_LISTS' + If this bit is set, then `\' inside a list (*note List Operators::.) + quotes (makes ordinary, if it's special) the following character; + if this bit isn't set, then `\' is an ordinary character inside + lists. (*Note The Backslash Character::, for what `\' does + outside of lists.) + +`RE_BK_PLUS_QM' + If this bit is set, then `\+' represents the match-one-or-more + operator and `\?' represents the match-zero-or-one operator; if + this bit isn't set, then `+' represents the match-one-or-more + operator and `?' represents the match-zero-or-one operator. This + bit is irrelevant if `RE_LIMITED_OPS' is set. + +`RE_CHAR_CLASSES' + If this bit is set, then you can use character classes in lists; + if this bit isn't set, then you can't. + +`RE_CONTEXT_INDEP_ANCHORS' + If this bit is set, then `^' and `$' are special anywhere outside + a list; if this bit isn't set, then these characters are special + only in certain contexts. *Note Match-beginning-of-line + Operator::, and *Note Match-end-of-line Operator::. 
+ +`RE_CONTEXT_INDEP_OPS' + If this bit is set, then certain characters are special anywhere + outside a list; if this bit isn't set, then those characters are + special only in some contexts and are ordinary elsewhere. + Specifically, if this bit isn't set then `*', and (if the syntax + bit `RE_LIMITED_OPS' isn't set) `+' and `?' (or `\+' and `\?', + depending on the syntax bit `RE_BK_PLUS_QM') represent repetition + operators only if they're not first in a regular expression or + just after an open-group or alternation operator. The same holds + for `{' (or `\{', depending on the syntax bit `RE_NO_BK_BRACES') if + it is the beginning of a valid interval and the syntax bit + `RE_INTERVALS' is set. + +`RE_CONTEXT_INVALID_OPS' + If this bit is set, then repetition and alternation operators + can't be in certain positions within a regular expression. + Specifically, the regular expression is invalid if it has: + + * a repetition operator first in the regular expression or just + after a match-beginning-of-line, open-group, or alternation + operator; or + + * an alternation operator first or last in the regular + expression, just before a match-end-of-line operator, or just + after an alternation or open-group operator. + + If this bit isn't set, then you can put the characters + representing the repetition and alternation characters anywhere in + a regular expression. Whether or not they will in fact be + operators in certain positions depends on other syntax bits. + +`RE_DOT_NEWLINE' + If this bit is set, then the match-any-character operator matches + a newline; if this bit isn't set, then it doesn't. + +`RE_DOT_NOT_NULL' + If this bit is set, then the match-any-character operator doesn't + match a null character; if this bit isn't set, then it does. + +`RE_INTERVALS' + If this bit is set, then Regex recognizes interval operators; if + this bit isn't set, then it doesn't. 
+ +`RE_LIMITED_OPS' + If this bit is set, then Regex doesn't recognize the + match-one-or-more, match-zero-or-one or alternation operators; if + this bit isn't set, then it does. + +`RE_NEWLINE_ALT' + If this bit is set, then newline represents the alternation + operator; if this bit isn't set, then newline is ordinary. + +`RE_NO_BK_BRACES' + If this bit is set, then `{' represents the open-interval operator + and `}' represents the close-interval operator; if this bit isn't + set, then `\{' represents the open-interval operator and `\}' + represents the close-interval operator. This bit is relevant only + if `RE_INTERVALS' is set. + +`RE_NO_BK_PARENS' + If this bit is set, then `(' represents the open-group operator and + `)' represents the close-group operator; if this bit isn't set, + then `\(' represents the open-group operator and `\)' represents + the close-group operator. + +`RE_NO_BK_REFS' + If this bit is set, then Regex doesn't recognize `\'DIGIT as the + back reference operator; if this bit isn't set, then it does. + +`RE_NO_BK_VBAR' + If this bit is set, then `|' represents the alternation operator; + if this bit isn't set, then `\|' represents the alternation + operator. This bit is irrelevant if `RE_LIMITED_OPS' is set. + +`RE_NO_EMPTY_RANGES' + If this bit is set, then a regular expression with a range whose + ending point collates lower than its starting point is invalid; if + this bit isn't set, then Regex considers such a range to be empty. + +`RE_UNMATCHED_RIGHT_PAREN_ORD' + If this bit is set and the regular expression has no matching + open-group operator, then Regex considers what would otherwise be + a close-group operator (based on how `RE_NO_BK_PARENS' is set) to + match `)'. + + +File: regex.info, Node: Predefined Syntaxes, Next: Collating Elements vs. 
Characters, Prev: Syntax Bits, Up: Regular Expression Syntax + +Predefined Syntaxes +=================== + + If you're programming with Regex, you can set a pattern buffer's +(*note GNU Pattern Buffers::., and *Note POSIX Pattern Buffers::) +`syntax' field either to an arbitrary combination of syntax bits (*note +Syntax Bits::.) or else to the configurations defined by Regex. These +configurations define the syntaxes used by certain programs--GNU Emacs, +POSIX Awk, traditional Awk, Grep, Egrep--in addition to syntaxes for +POSIX basic and extended regular expressions. + + The predefined syntaxes-taken directly from `regex.h'--are: + + #define RE_SYNTAX_EMACS 0 + + #define RE_SYNTAX_AWK \ + (RE_BACKSLASH_ESCAPE_IN_LISTS | RE_DOT_NOT_NULL \ + | RE_NO_BK_PARENS | RE_NO_BK_REFS \ + | RE_NO_BK_VBAR | RE_NO_EMPTY_RANGES \ + | RE_UNMATCHED_RIGHT_PAREN_ORD) + + #define RE_SYNTAX_POSIX_AWK \ + (RE_SYNTAX_POSIX_EXTENDED | RE_BACKSLASH_ESCAPE_IN_LISTS) + + #define RE_SYNTAX_GREP \ + (RE_BK_PLUS_QM | RE_CHAR_CLASSES \ + | RE_HAT_LISTS_NOT_NEWLINE | RE_INTERVALS \ + | RE_NEWLINE_ALT) + + #define RE_SYNTAX_EGREP \ + (RE_CHAR_CLASSES | RE_CONTEXT_INDEP_ANCHORS \ + | RE_CONTEXT_INDEP_OPS | RE_HAT_LISTS_NOT_NEWLINE \ + | RE_NEWLINE_ALT | RE_NO_BK_PARENS \ + | RE_NO_BK_VBAR) + + #define RE_SYNTAX_POSIX_EGREP \ + (RE_SYNTAX_EGREP | RE_INTERVALS | RE_NO_BK_BRACES) + + /* P1003.2/D11.2, section 4.20.7.1, lines 5078ff. */ + #define RE_SYNTAX_ED RE_SYNTAX_POSIX_BASIC + + #define RE_SYNTAX_SED RE_SYNTAX_POSIX_BASIC + + /* Syntax bits common to both basic and extended POSIX regex syntax. */ + #define _RE_SYNTAX_POSIX_COMMON \ + (RE_CHAR_CLASSES | RE_DOT_NEWLINE | RE_DOT_NOT_NULL \ + | RE_INTERVALS | RE_NO_EMPTY_RANGES) + + #define RE_SYNTAX_POSIX_BASIC \ + (_RE_SYNTAX_POSIX_COMMON | RE_BK_PLUS_QM) + + /* Differs from ..._POSIX_BASIC only in that RE_BK_PLUS_QM becomes + RE_LIMITED_OPS, i.e., \? \+ \| are not recognized. 
Actually, this + isn't minimal, since other operators, such as \`, aren't disabled. */ + #define RE_SYNTAX_POSIX_MINIMAL_BASIC \ + (_RE_SYNTAX_POSIX_COMMON | RE_LIMITED_OPS) + + #define RE_SYNTAX_POSIX_EXTENDED \ + (_RE_SYNTAX_POSIX_COMMON | RE_CONTEXT_INDEP_ANCHORS \ + | RE_CONTEXT_INDEP_OPS | RE_NO_BK_BRACES \ + | RE_NO_BK_PARENS | RE_NO_BK_VBAR \ + | RE_UNMATCHED_RIGHT_PAREN_ORD) + + /* Differs from ..._POSIX_EXTENDED in that RE_CONTEXT_INVALID_OPS + replaces RE_CONTEXT_INDEP_OPS and RE_NO_BK_REFS is added. */ + #define RE_SYNTAX_POSIX_MINIMAL_EXTENDED \ + (_RE_SYNTAX_POSIX_COMMON | RE_CONTEXT_INDEP_ANCHORS \ + | RE_CONTEXT_INVALID_OPS | RE_NO_BK_BRACES \ + | RE_NO_BK_PARENS | RE_NO_BK_REFS \ + | RE_NO_BK_VBAR | RE_UNMATCHED_RIGHT_PAREN_ORD) + + +File: regex.info, Node: Collating Elements vs. Characters, Next: The Backslash Character, Prev: Predefined Syntaxes, Up: Regular Expression Syntax + +Collating Elements vs. Characters +================================= + + POSIX generalizes the notion of a character to that of a collating +element. It defines a "collating element" to be "a sequence of one or +more bytes defined in the current collating sequence as a unit of +collation." + + This generalizes the notion of a character in two ways. First, a +single character can map into two or more collating elements. For +example, the German "es-zet" collates as the collating element `s' +followed by another collating element `s'. Second, two or more +characters can map into one collating element. For example, the +Spanish `ll' collates after `l' and before `m'. + + Since POSIX's "collating element" preserves the essential idea of a +"character," we use the latter, more familiar, term in this document. + + +File: regex.info, Node: The Backslash Character, Prev: Collating Elements vs. 
Characters, Up: Regular Expression Syntax + +The Backslash Character +======================= + + The `\' character has one of four different meanings, depending on +the context in which you use it and what syntax bits are set (*note +Syntax Bits::.). It can: 1) stand for itself, 2) quote the next +character, 3) introduce an operator, or 4) do nothing. + + 1. It stands for itself inside a list (*note List Operators::.) if + the syntax bit `RE_BACKSLASH_ESCAPE_IN_LISTS' is not set. For + example, `[\]' would match `\'. + + 2. It quotes (makes ordinary, if it's special) the next character + when you use it either: + + * outside a list,(1) or + + * inside a list and the syntax bit + `RE_BACKSLASH_ESCAPE_IN_LISTS' is set. + + 3. It introduces an operator when followed by certain ordinary + characters--sometimes only when certain syntax bits are set. See + the cases `RE_BK_PLUS_QM', `RE_NO_BK_BRACES', `RE_NO_BK_VBAR', + `RE_NO_BK_PARENS', `RE_NO_BK_REFS' in *Note Syntax Bits::. Also: + + * `\b' represents the match-word-boundary operator (*note + Match-word-boundary Operator::.). + + * `\B' represents the match-within-word operator (*note + Match-within-word Operator::.). + + * `\<' represents the match-beginning-of-word operator + (*note Match-beginning-of-word Operator::.). + + * `\>' represents the match-end-of-word operator (*note + Match-end-of-word Operator::.). + + * `\w' represents the match-word-constituent operator (*note + Match-word-constituent Operator::.). + + * `\W' represents the match-non-word-constituent operator + (*note Match-non-word-constituent Operator::.). + + * `\`' represents the match-beginning-of-buffer operator and + `\'' represents the match-end-of-buffer operator (*note + Buffer Operators::.). + + * If Regex was compiled with the C preprocessor symbol `emacs' + defined, then `\sCLASS' represents the match-syntactic-class + operator and `\SCLASS' represents the + match-not-syntactic-class operator (*note Syntactic Class + Operators::.). 
+ + 4. In all other cases, Regex ignores `\'. For example, `\n' matches + `n'. + + + ---------- Footnotes ---------- + + (1) Sometimes you don't have to explicitly quote special characters +to make them ordinary. For instance, most characters lose any special +meaning inside a list (*note List Operators::.). In addition, if the +syntax bits `RE_CONTEXT_INVALID_OPS' and `RE_CONTEXT_INDEP_OPS' aren't +set, then (for historical reasons) the matcher considers special +characters ordinary if they are in contexts where the operations they +represent make no sense; for example, then the match-zero-or-more +operator (represented by `*') matches itself in the regular expression +`*foo' because there is no preceding expression on which it can +operate. It is poor practice, however, to depend on this behavior; if +you want a special character to be ordinary outside a list, it's better +to always quote it, regardless. + + +File: regex.info, Node: Common Operators, Next: GNU Operators, Prev: Regular Expression Syntax, Up: Top + +Common Operators +**************** + + You compose regular expressions from operators. In the following +sections, we describe the regular expression operators specified by +POSIX; GNU also uses these. Most operators have more than one +representation as characters. *Note Regular Expression Syntax::, for +what characters represent what operators under what circumstances. + + For most operators that can be represented in two ways, one +representation is a single character and the other is that character +preceded by `\'. For example, either `(' or `\(' represents the +open-group operator. Which one does depends on the setting of a syntax +bit, in this case `RE_NO_BK_PARENS'. Why is this so? Historical +reasons dictate some of the varying representations, while POSIX +dictates others. + + Finally, almost all characters lose any special meaning inside a list +(*note List Operators::.). + +* Menu: + +* Match-self Operator:: Ordinary characters. 
+* Match-any-character Operator:: . +* Concatenation Operator:: Juxtaposition. +* Repetition Operators:: * + ? {} +* Alternation Operator:: | +* List Operators:: [...] [^...] +* Grouping Operators:: (...) +* Back-reference Operator:: \digit +* Anchoring Operators:: ^ $ + + +File: regex.info, Node: Match-self Operator, Next: Match-any-character Operator, Up: Common Operators + +The Match-self Operator (ORDINARY CHARACTER) +============================================ + + This operator matches the character itself. All ordinary characters +(*note Regular Expression Syntax::.) represent this operator. For +example, `f' is always an ordinary character, so the regular expression +`f' matches only the string `f'. In particular, it does *not* match +the string `ff'. + + +File: regex.info, Node: Match-any-character Operator, Next: Concatenation Operator, Prev: Match-self Operator, Up: Common Operators + +The Match-any-character Operator (`.') +====================================== + + This operator matches any single printing or nonprinting character +except it won't match a: + +newline + if the syntax bit `RE_DOT_NEWLINE' isn't set. + +null + if the syntax bit `RE_DOT_NOT_NULL' is set. + + The `.' (period) character represents this operator. For example, +`a.b' matches any three-character string beginning with `a' and ending +with `b'. + + +File: regex.info, Node: Concatenation Operator, Next: Repetition Operators, Prev: Match-any-character Operator, Up: Common Operators + +The Concatenation Operator +========================== + + This operator concatenates two regular expressions A and B. No +character represents this operator; you simply put B after A. The +result is a regular expression that will match a string if A matches +its first part and B matches the rest. For example, `xy' (two +match-self operators) matches `xy'. 
+ + +File: regex.info, Node: Repetition Operators, Next: Alternation Operator, Prev: Concatenation Operator, Up: Common Operators + +Repetition Operators +==================== + + Repetition operators repeat the preceding regular expression a +specified number of times. + +* Menu: + +* Match-zero-or-more Operator:: * +* Match-one-or-more Operator:: + +* Match-zero-or-one Operator:: ? +* Interval Operators:: {} + + +File: regex.info, Node: Match-zero-or-more Operator, Next: Match-one-or-more Operator, Up: Repetition Operators + +The Match-zero-or-more Operator (`*') +------------------------------------- + + This operator repeats the smallest possible preceding regular +expression as many times as necessary (including zero) to match the +pattern. `*' represents this operator. For example, `o*' matches any +string made up of zero or more `o's. Since this operator operates on +the smallest preceding regular expression, `fo*' has a repeating `o', +not a repeating `fo'. So, `fo*' matches `f', `fo', `foo', and so on. + + Since the match-zero-or-more operator is a suffix operator, it may be +useless as such when no regular expression precedes it. This is the +case when it: + + * is first in a regular expression, or + + * follows a match-beginning-of-line, open-group, or alternation + operator. + +Three different things can happen in these cases: + + 1. If the syntax bit `RE_CONTEXT_INVALID_OPS' is set, then the + regular expression is invalid. + + 2. If `RE_CONTEXT_INVALID_OPS' isn't set, but `RE_CONTEXT_INDEP_OPS' + is, then `*' represents the match-zero-or-more operator (which + then operates on the empty string). + + 3. Otherwise, `*' is ordinary. + + + The matcher processes a match-zero-or-more operator by first matching +as many repetitions of the smallest preceding regular expression as it +can. Then it continues to match the rest of the pattern. 
+ + If it can't match the rest of the pattern, it backtracks (as many +times as necessary), each time discarding one of the matches until it +can either match the entire pattern or be certain that it cannot get a +match. For example, when matching `ca*ar' against `caaar', the matcher +first matches all three `a's of the string with the `a*' of the regular +expression. However, it cannot then match the final `ar' of the +regular expression against the final `r' of the string. So it +backtracks, discarding the match of the last `a' in the string. It can +then match the remaining `ar'. + + +File: regex.info, Node: Match-one-or-more Operator, Next: Match-zero-or-one Operator, Prev: Match-zero-or-more Operator, Up: Repetition Operators + +The Match-one-or-more Operator (`+' or `\+') +-------------------------------------------- + + If the syntax bit `RE_LIMITED_OPS' is set, then Regex doesn't +recognize this operator. Otherwise, if the syntax bit `RE_BK_PLUS_QM' +isn't set, then `+' represents this operator; if it is, then `\+' does. + + This operator is similar to the match-zero-or-more operator except +that it repeats the preceding regular expression at least once; *note +Match-zero-or-more Operator::., for what it operates on, how some +syntax bits affect it, and how Regex backtracks to match it. + + For example, supposing that `+' represents the match-one-or-more +operator; then `ca+r' matches, e.g., `car' and `caaaar', but not `cr'. + + +File: regex.info, Node: Match-zero-or-one Operator, Next: Interval Operators, Prev: Match-one-or-more Operator, Up: Repetition Operators + +The Match-zero-or-one Operator (`?' or `\?') +-------------------------------------------- + + If the syntax bit `RE_LIMITED_OPS' is set, then Regex doesn't +recognize this operator. Otherwise, if the syntax bit `RE_BK_PLUS_QM' +isn't set, then `?' represents this operator; if it is, then `\?' does. 
+ + This operator is similar to the match-zero-or-more operator except +that it repeats the preceding regular expression once or not at all; +*note Match-zero-or-more Operator::., to see what it operates on, how +some syntax bits affect it, and how Regex backtracks to match it. + + For example, supposing that `?' represents the match-zero-or-one +operator; then `ca?r' matches both `car' and `cr', but nothing else. + + +File: regex.info, Node: Interval Operators, Prev: Match-zero-or-one Operator, Up: Repetition Operators + +Interval Operators (`{' ... `}' or `\{' ... `\}') +------------------------------------------------- + + If the syntax bit `RE_INTERVALS' is set, then Regex recognizes +"interval expressions". They repeat the smallest possible preceding +regular expression a specified number of times. + + If the syntax bit `RE_NO_BK_BRACES' is set, `{' represents the +"open-interval operator" and `}' represents the "close-interval +operator"; otherwise, `\{' and `\}' do. + + Specifically, supposing that `{' and `}' represent the open-interval +and close-interval operators; then: + +`{COUNT}' + matches exactly COUNT occurrences of the preceding regular + expression. + +`{MIN,}' + matches MIN or more occurrences of the preceding regular + expression. + +`{MIN,MAX}' + matches at least MIN but no more than MAX occurrences of the + preceding regular expression. + + The interval expression (but not necessarily the regular expression +that contains it) is invalid if: + + * MIN is greater than MAX, or + + * any of COUNT, MIN, or MAX are outside the range zero to + `RE_DUP_MAX' (which symbol `regex.h' defines). + + If the interval expression is invalid and the syntax bit +`RE_NO_BK_BRACES' is set, then Regex considers all the characters in +the would-be interval to be ordinary. If that bit isn't set, then the +regular expression is invalid. 
+ + If the interval expression is valid but there is no preceding regular +expression on which to operate, then if the syntax bit +`RE_CONTEXT_INVALID_OPS' is set, the regular expression is invalid. If +that bit isn't set, then Regex considers all the characters--other than +backslashes, which it ignores--in the would-be interval to be ordinary. + + +File: regex.info, Node: Alternation Operator, Next: List Operators, Prev: Repetition Operators, Up: Common Operators + +The Alternation Operator (`|' or `\|') +====================================== + + If the syntax bit `RE_LIMITED_OPS' is set, then Regex doesn't +recognize this operator. Otherwise, if the syntax bit `RE_NO_BK_VBAR' +is set, then `|' represents this operator; otherwise, `\|' does. + + Alternatives match one of a choice of regular expressions: if you put +the character(s) representing the alternation operator between any two +regular expressions A and B, the result matches the union of the +strings that A and B match. For example, supposing that `|' is the +alternation operator, then `foo|bar|quux' would match any of `foo', +`bar' or `quux'. + + The alternation operator operates on the *largest* possible +surrounding regular expressions. (Put another way, it has the lowest +precedence of any regular expression operator.) Thus, the only way you +can delimit its arguments is to use grouping. For example, if `(' and +`)' are the open and close-group operators, then `fo(o|b)ar' would +match either `fooar' or `fobar'. (`foo|bar' would match `foo' or +`bar'.) + + The matcher usually tries all combinations of alternatives so as to +match the longest possible string. For example, when matching +`(fooq|foo)*(qbarquux|bar)' against `fooqbarquux', it cannot take, say, +the first ("depth-first") combination it could match, since then it +would be content to match just `fooqbar'. 
+ + +File: regex.info, Node: List Operators, Next: Grouping Operators, Prev: Alternation Operator, Up: Common Operators + +List Operators (`[' ... `]' and `[^' ... `]') +============================================= + + "Lists", also called "bracket expressions", are a set of one or more +items. An "item" is a character, a character class expression, or a +range expression. The syntax bits affect which kinds of items you can +put in a list. We explain the last two items in subsections below. +Empty lists are invalid. + + A "matching list" matches a single character represented by one of +the list items. You form a matching list by enclosing one or more items +within an "open-matching-list operator" (represented by `[') and a +"close-list operator" (represented by `]'). + + For example, `[ab]' matches either `a' or `b'. `[ad]*' matches the +empty string and any string composed of just `a's and `d's in any +order. Regex considers invalid a regular expression with a `[' but no +matching `]'. + + "Nonmatching lists" are similar to matching lists except that they +match a single character *not* represented by one of the list items. +You use an "open-nonmatching-list operator" (represented by `[^'(1)) +instead of an open-matching-list operator to start a nonmatching list. + + For example, `[^ab]' matches any character except `a' or `b'. + + If the `posix_newline' field in the pattern buffer (*note GNU Pattern +Buffers::. is set, then nonmatching lists do not match a newline. + + Most characters lose any special meaning inside a list. The special +characters inside a list follow. + +`]' + ends the list if it's not the first list item. So, if you want to + make the `]' character a list item, you must put it first. + +`\' + quotes the next character if the syntax bit + `RE_BACKSLASH_ESCAPE_IN_LISTS' is set. + +`[:' + represents the open-character-class operator (*note Character + Class Operators::.) 
if the syntax bit `RE_CHAR_CLASSES' is set and + what follows is a valid character class expression. + +`:]' + represents the close-character-class operator if the syntax bit + `RE_CHAR_CLASSES' is set and what precedes it is an + open-character-class operator followed by a valid character class + name. + +`-' + represents the range operator (*note Range Operator::.) if it's + not first or last in a list or the ending point of a range. + +All other characters are ordinary. For example, `[.*]' matches `.' and +`*'. + +* Menu: + +* Character Class Operators:: [:class:] +* Range Operator:: start-end + + ---------- Footnotes ---------- + + (1) Regex therefore doesn't consider the `^' to be the first +character in the list. If you put a `^' character first in (what you +think is) a matching list, you'll turn it into a nonmatching list. + + +File: regex.info, Node: Character Class Operators, Next: Range Operator, Up: List Operators + +Character Class Operators (`[:' ... `:]') +----------------------------------------- + + If the syntax bit `RE_CHAR_CLASSES' is set, then Regex +recognizes character class expressions inside lists. A "character +class expression" matches one character from a given class. You form a +character class expression by putting a character class name between an +"open-character-class operator" (represented by `[:') and a +"close-character-class operator" (represented by `:]'). 
The character +class names and their meanings are: + +`alnum' + letters and digits + +`alpha' + letters + +`blank' + system-dependent; for GNU, a space or tab + +`cntrl' + control characters (in the ASCII encoding, code 0177 and codes + less than 040) + +`digit' + digits + +`graph' + same as `print' except omits space + +`lower' + lowercase letters + +`print' + printable characters (in the ASCII encoding, space tilde--codes + 040 through 0176) + +`punct' + neither control nor alphanumeric characters + +`space' + space, carriage return, newline, vertical tab, and form feed + +`upper' + uppercase letters + +`xdigit' + hexadecimal digits: `0'-`9', `a'-`f', `A'-`F' + +These correspond to the definitions in the C library's `<ctype.h>' +facility. For example, `[:alpha:]' corresponds to the standard +facility `isalpha'. Regex recognizes character class expressions only +inside of lists; so `[[:alpha:]]' matches any letter, but `[:alpha:]' +outside of a bracket expression and not followed by a repetition +operator matches just itself. + + +File: regex.info, Node: Range Operator, Prev: Character Class Operators, Up: List Operators + +The Range Operator (`-') +------------------------ + + Regex recognizes "range expressions" inside a list. They represent +those characters that fall between two elements in the current +collating sequence. You form a range expression by putting a "range +operator" between two characters.(1) `-' represents the range operator. +For example, `a-f' within a list represents all the characters from `a' +through `f' inclusively. + + If the syntax bit `RE_NO_EMPTY_RANGES' is set, then if the range's +ending point collates less than its starting point, the range (and the +regular expression containing it) is invalid. For example, the regular +expression `[z-a]' would be invalid. If this bit isn't set, then Regex +considers such a range to be empty. 
+ + Since `-' represents the range operator, if you want to make a `-' +character itself a list item, you must do one of the following: + + * Put the `-' either first or last in the list. + + * Include a range whose starting point collates strictly lower than + `-' and whose ending point collates equal or higher. Unless a + range is the first item in a list, a `-' can't be its starting + point, but *can* be its ending point. That is because Regex + considers `-' to be the range operator unless it is preceded by + another `-'. For example, in the ASCII encoding, `)', `*', `+', + `,', `-', `.', and `/' are contiguous characters in the collating + sequence. You might think that `[)-+--/]' has two ranges: `)-+' + and `--/'. Rather, it has the ranges `)-+' and `+--', plus the + character `/', so it matches, e.g., `,', not `.'. + + * Put a range whose starting point is `-' first in the list. + + For example, `[-a-z]' matches a lowercase letter or a hyphen (in +English, in ASCII). + + ---------- Footnotes ---------- + + (1) You can't use a character class for the starting or ending point +of a range, since a character class is not a single character. + + +File: regex.info, Node: Grouping Operators, Next: Back-reference Operator, Prev: List Operators, Up: Common Operators + +Grouping Operators (`(' ... `)' or `\(' ... `\)') +================================================= + + A "group", also known as a "subexpression", consists of an +"open-group operator", any number of other operators, and a +"close-group operator". Regex treats this sequence as a unit, just as +mathematics and programming languages treat a parenthesized expression +as a unit. + + Therefore, using "groups", you can: + + * delimit the argument(s) to an alternation operator (*note + Alternation Operator::.) or a repetition operator (*note + Repetition Operators::.). + + * keep track of the indices of the substring that matched a given + group. *Note Using Registers::, for a precise explanation. 
This + lets you: + + * use the back-reference operator (*note Back-reference + Operator::.). + + * use registers (*note Using Registers::.). + + If the syntax bit `RE_NO_BK_PARENS' is set, then `(' represents the +open-group operator and `)' represents the close-group operator; +otherwise, `\(' and `\)' do. + + If the syntax bit `RE_UNMATCHED_RIGHT_PAREN_ORD' is set and a +close-group operator has no matching open-group operator, then Regex +considers it to match `)'. + + +File: regex.info, Node: Back-reference Operator, Next: Anchoring Operators, Prev: Grouping Operators, Up: Common Operators + +The Back-reference Operator ("\"DIGIT) +====================================== + + If the syntax bit `RE_NO_BK_REFS' isn't set, then Regex recognizes +back references. A back reference matches a specified preceding group. +The back reference operator is represented by `\DIGIT' anywhere after +the end of a regular expression's DIGIT-th group (*note Grouping +Operators::.). + + DIGIT must be between `1' and `9'. The matcher assigns numbers 1 +through 9 to the first nine groups it encounters. By using one of `\1' +through `\9' after the corresponding group's close-group operator, you +can match a substring identical to the one that the group does. + + Back references match according to the following (in all examples +below, `(' represents the open-group, `)' the close-group, `{' the +open-interval and `}' the close-interval operator): + + * If the group matches a substring, the back reference matches an + identical substring. For example, `(a)\1' matches `aa' and + `(bana)na\1bo\1' matches `bananabanabobana'. Likewise, `(.*)\1' + matches any (newline-free if the syntax bit `RE_DOT_NEWLINE' isn't + set) string that is composed of two identical halves; the `(.*)' + matches the first half and the `\1' matches the second half. 
+ + * If the group matches more than once (as it might if followed by, + e.g., a repetition operator), then the back reference matches the + substring the group *last* matched. For example, `((a*)b)*\1\2' + matches `aabababa'; first group 1 (the outer one) matches `aab' + and group 2 (the inner one) matches `aa'. Then group 1 matches + `ab' and group 2 matches `a'. So, `\1' matches `ab' and `\2' + matches `a'. + + * If the group doesn't participate in a match, i.e., it is part of an + alternative not taken or a repetition operator allows zero + repetitions of it, then the back reference makes the whole match + fail. For example, `(one()|two())-and-(three\2|four\3)' matches + `one-and-three' and `two-and-four', but not `one-and-four' or + `two-and-three'. For example, if the pattern matches `one-and-', + then its group 2 matches the empty string and its group 3 doesn't + participate in the match. So, if it then matches `four', then + when it tries to back reference group 3--which it will attempt to + do because `\3' follows the `four'--the match will fail because + group 3 didn't participate in the match. + + You can use a back reference as an argument to a repetition operator. +For example, `(a(b))\2*' matches `a' followed by one or more `b's. +Similarly, `(a(b))\2{3}' matches `abbbb'. + + If there is no preceding DIGIT-th subexpression, the regular +expression is invalid. + + +File: regex.info, Node: Anchoring Operators, Prev: Back-reference Operator, Up: Common Operators + +Anchoring Operators +=================== + + These operators can constrain a pattern to match only at the +beginning or end of the entire string or at the beginning or end of a +line. 
+ +* Menu: + +* Match-beginning-of-line Operator:: ^ +* Match-end-of-line Operator:: $ + + +File: regex.info, Node: Match-beginning-of-line Operator, Next: Match-end-of-line Operator, Up: Anchoring Operators + +The Match-beginning-of-line Operator (`^') +------------------------------------------ + + This operator can match the empty string either at the beginning of +the string or after a newline character. Thus, it is said to "anchor" +the pattern to the beginning of a line. + + In the cases following, `^' represents this operator. (Otherwise, +`^' is ordinary.) + + * It (the `^') is first in the pattern, as in `^foo'. + + * The syntax bit `RE_CONTEXT_INDEP_ANCHORS' is set, and it is outside + a bracket expression. + + * It follows an open-group or alternation operator, as in `a\(^b\)' + and `a\|^b'. *Note Grouping Operators::, and *Note Alternation + Operator::. + + These rules imply that some valid patterns containing `^' cannot be +matched; for example, `foo^bar' if `RE_CONTEXT_INDEP_ANCHORS' is set. + + If the `not_bol' field is set in the pattern buffer (*note GNU +Pattern Buffers::.), then `^' fails to match at the beginning of the +string. *Note POSIX Matching::, for when you might find this useful. + + If the `newline_anchor' field isn't set in the pattern buffer, then `^' +fails to match after a newline. This is useful when you do not regard +the string to be matched as broken into lines. + + +File: regex.info, Node: Match-end-of-line Operator, Prev: Match-beginning-of-line Operator, Up: Anchoring Operators + +The Match-end-of-line Operator (`$') +------------------------------------ + + This operator can match the empty string either at the end of the +string or before a newline character in the string. Thus, it is said +to "anchor" the pattern to the end of a line. + + It is always represented by `$'. For example, `foo$' usually +matches, e.g., `foo' and, e.g., the first three characters of +`foo\nbar'. 
+ + Its interaction with the syntax bits and pattern buffer fields is +exactly the dual of `^''s; see the previous section. (That is, +"beginning" becomes "end", "next" becomes "previous", and "after" +becomes "before".) + + +File: regex.info, Node: GNU Operators, Next: GNU Emacs Operators, Prev: Common Operators, Up: Top + +GNU Operators +************* + + Following are operators that GNU defines (and POSIX doesn't). + +* Menu: + +* Word Operators:: +* Buffer Operators:: + + +File: regex.info, Node: Word Operators, Next: Buffer Operators, Up: GNU Operators + +Word Operators +============== + + The operators in this section require Regex to recognize parts of +words. Regex uses a syntax table to determine whether or not a +character is part of a word, i.e., whether or not it is +"word-constituent". + +* Menu: + +* Non-Emacs Syntax Tables:: +* Match-word-boundary Operator:: \b +* Match-within-word Operator:: \B +* Match-beginning-of-word Operator:: \< +* Match-end-of-word Operator:: \> +* Match-word-constituent Operator:: \w +* Match-non-word-constituent Operator:: \W + + +File: regex.info, Node: Non-Emacs Syntax Tables, Next: Match-word-boundary Operator, Up: Word Operators + +Non-Emacs Syntax Tables +----------------------- + + A "syntax table" is an array indexed by the characters in your +character set. In the ASCII encoding, therefore, a syntax table has +256 elements. Regex always uses a `char *' variable `re_syntax_table' +as its syntax table. In some cases, it initializes this variable and +in others it expects you to initialize it. + + * If Regex is compiled with the preprocessor symbols `emacs' and + `SYNTAX_TABLE' both undefined, then Regex allocates + `re_syntax_table' and initializes an element I either to `Sword' + (which it defines) if I is a letter, number, or `_', or to zero if + it's not. 
+ + * If Regex is compiled with `emacs' undefined but `SYNTAX_TABLE' + defined, then Regex expects you to define a `char *' variable + `re_syntax_table' to be a valid syntax table. + + * *Note Emacs Syntax Tables::, for what happens when Regex is + compiled with the preprocessor symbol `emacs' defined. + + +File: regex.info, Node: Match-word-boundary Operator, Next: Match-within-word Operator, Prev: Non-Emacs Syntax Tables, Up: Word Operators + +The Match-word-boundary Operator (`\b') +--------------------------------------- + + This operator (represented by `\b') matches the empty string at +either the beginning or the end of a word. For example, `\brat\b' +matches the separate word `rat'. + + +File: regex.info, Node: Match-within-word Operator, Next: Match-beginning-of-word Operator, Prev: Match-word-boundary Operator, Up: Word Operators + +The Match-within-word Operator (`\B') +------------------------------------- + + This operator (represented by `\B') matches the empty string within a +word. For example, `c\Brat\Be' matches `crate', but `dirty \Brat' +doesn't match `dirty rat'. + + +File: regex.info, Node: Match-beginning-of-word Operator, Next: Match-end-of-word Operator, Prev: Match-within-word Operator, Up: Word Operators + +The Match-beginning-of-word Operator (`\<') +------------------------------------------- + + This operator (represented by `\<') matches the empty string at the +beginning of a word. + + +File: regex.info, Node: Match-end-of-word Operator, Next: Match-word-constituent Operator, Prev: Match-beginning-of-word Operator, Up: Word Operators + +The Match-end-of-word Operator (`\>') +------------------------------------- + + This operator (represented by `\>') matches the empty string at the +end of a word. 
+ + +File: regex.info, Node: Match-word-constituent Operator, Next: Match-non-word-constituent Operator, Prev: Match-end-of-word Operator, Up: Word Operators + +The Match-word-constituent Operator (`\w') +------------------------------------------ + + This operator (represented by `\w') matches any word-constituent +character. + + +File: regex.info, Node: Match-non-word-constituent Operator, Prev: Match-word-constituent Operator, Up: Word Operators + +The Match-non-word-constituent Operator (`\W') +---------------------------------------------- + + This operator (represented by `\W') matches any character that is not +word-constituent. + + +File: regex.info, Node: Buffer Operators, Prev: Word Operators, Up: GNU Operators + +Buffer Operators +================ + + Following are operators which work on buffers. In Emacs, a "buffer" +is, naturally, an Emacs buffer. For other programs, Regex considers the +entire string to be matched as the buffer. + +* Menu: + +* Match-beginning-of-buffer Operator:: \` +* Match-end-of-buffer Operator:: \' + + +File: regex.info, Node: Match-beginning-of-buffer Operator, Next: Match-end-of-buffer Operator, Up: Buffer Operators + +The Match-beginning-of-buffer Operator (`\`') +--------------------------------------------- + + This operator (represented by `\`') matches the empty string at the +beginning of the buffer. + + +File: regex.info, Node: Match-end-of-buffer Operator, Prev: Match-beginning-of-buffer Operator, Up: Buffer Operators + +The Match-end-of-buffer Operator (`\'') +--------------------------------------- + + This operator (represented by `\'') matches the empty string at the +end of the buffer. + + +File: regex.info, Node: GNU Emacs Operators, Next: What Gets Matched?, Prev: GNU Operators, Up: Top + +GNU Emacs Operators +******************* + + Following are operators that GNU defines (and POSIX doesn't) that you +can use only when Regex is compiled with the preprocessor symbol +`emacs' defined. 
+ +* Menu: + +* Syntactic Class Operators:: + + +File: regex.info, Node: Syntactic Class Operators, Up: GNU Emacs Operators + +Syntactic Class Operators +========================= + + The operators in this section require Regex to recognize the syntactic +classes of characters. Regex uses a syntax table to determine this. + +* Menu: + +* Emacs Syntax Tables:: +* Match-syntactic-class Operator:: \sCLASS +* Match-not-syntactic-class Operator:: \SCLASS + + +File: regex.info, Node: Emacs Syntax Tables, Next: Match-syntactic-class Operator, Up: Syntactic Class Operators + +Emacs Syntax Tables +------------------- + + A "syntax table" is an array indexed by the characters in your +character set. In the ASCII encoding, therefore, a syntax table has +256 elements. + + If Regex is compiled with the preprocessor symbol `emacs' defined, +then Regex expects you to define and initialize the variable +`re_syntax_table' to be an Emacs syntax table. Emacs' syntax tables +are more complicated than Regex's own (*note Non-Emacs Syntax +Tables::.). *Note Syntax: (emacs)Syntax, for a description of Emacs' +syntax tables. + + +File: regex.info, Node: Match-syntactic-class Operator, Next: Match-not-syntactic-class Operator, Prev: Emacs Syntax Tables, Up: Syntactic Class Operators + +The Match-syntactic-class Operator (`\s'CLASS) +---------------------------------------------- + + This operator matches any character whose syntactic class is +represented by a specified character. `\sCLASS' represents this +operator where CLASS is the character representing the syntactic class +you want. For example, `w' represents the syntactic class of +word-constituent characters, so `\sw' matches any word-constituent +character. 
+ + +File: regex.info, Node: Match-not-syntactic-class Operator, Prev: Match-syntactic-class Operator, Up: Syntactic Class Operators + +The Match-not-syntactic-class Operator (`\S'CLASS) +-------------------------------------------------- + + This operator is similar to the match-syntactic-class operator except +that it matches any character whose syntactic class is *not* +represented by the specified character. `\SCLASS' represents this +operator. For example, `w' represents the syntactic class of +word-constituent characters, so `\Sw' matches any character that is not +word-constituent. + + +File: regex.info, Node: What Gets Matched?, Next: Programming with Regex, Prev: GNU Emacs Operators, Up: Top + +What Gets Matched? +****************** + + Regex usually matches strings according to the "leftmost longest" +rule; that is, it chooses the longest of the leftmost matches. This +does not mean that for a regular expression containing subexpressions +that it simply chooses the longest match for each subexpression, left to +right; the overall match must also be the longest possible one. + + For example, `(ac*)(c*d[ac]*)\1' matches `acdacaaa', not `acdac', as +it would if it were to choose the longest match for the first +subexpression. + + +File: regex.info, Node: Programming with Regex, Next: Copying, Prev: What Gets Matched?, Up: Top + +Programming with Regex +********************** + + Here we describe how you use the Regex data structures and functions +in C programs. Regex has three interfaces: one designed for GNU, one +compatible with POSIX and one compatible with Berkeley UNIX. + +* Menu: + +* GNU Regex Functions:: +* POSIX Regex Functions:: +* BSD Regex Functions:: + + +File: regex.info, Node: GNU Regex Functions, Next: POSIX Regex Functions, Up: Programming with Regex + +GNU Regex Functions +=================== + + If you're writing code that doesn't need to be compatible with either +POSIX or Berkeley UNIX, you can use these functions. 
They provide more +options than the other interfaces. + +* Menu: + +* GNU Pattern Buffers:: The re_pattern_buffer type. +* GNU Regular Expression Compiling:: re_compile_pattern () +* GNU Matching:: re_match () +* GNU Searching:: re_search () +* Matching/Searching with Split Data:: re_match_2 (), re_search_2 () +* Searching with Fastmaps:: re_compile_fastmap () +* GNU Translate Tables:: The `translate' field. +* Using Registers:: The re_registers type and related fns. +* Freeing GNU Pattern Buffers:: regfree () + + +File: regex.info, Node: GNU Pattern Buffers, Next: GNU Regular Expression Compiling, Up: GNU Regex Functions + +GNU Pattern Buffers +------------------- + + To compile, match, or search for a given regular expression, you must +supply a pattern buffer. A "pattern buffer" holds one compiled regular +expression.(1) + + You can have several different pattern buffers simultaneously, each +holding a compiled pattern for a different regular expression. + + `regex.h' defines the pattern buffer `struct' as follows: + + /* Space that holds the compiled pattern. It is declared as + `unsigned char *' because its elements are + sometimes used as array indexes. */ + unsigned char *buffer; + + /* Number of bytes to which `buffer' points. */ + unsigned long allocated; + + /* Number of bytes actually used in `buffer'. */ + unsigned long used; + + /* Syntax setting with which the pattern was compiled. */ + reg_syntax_t syntax; + + /* Pointer to a fastmap, if any, otherwise zero. re_search uses + the fastmap, if there is one, to skip over impossible + starting points for matches. */ + char *fastmap; + + /* Either a translate table to apply to all characters before + comparing them, or zero for no translation. The translation + is applied to a pattern when it is compiled and to a string + when it is matched. */ + char *translate; + + /* Number of subexpressions found by the compiler. */ + size_t re_nsub; + + /* Zero if this pattern cannot match the empty string, one else. 
+ Well, in truth it's used only in `re_search_2', to see + whether or not we should use the fastmap, so we don't set + this absolutely perfectly; see `re_compile_fastmap' (the + `duplicate' case). */ + unsigned can_be_null : 1; + + /* If REGS_UNALLOCATED, allocate space in the `regs' structure + for `max (RE_NREGS, re_nsub + 1)' groups. + If REGS_REALLOCATE, reallocate space if necessary. + If REGS_FIXED, use what's there. */ + #define REGS_UNALLOCATED 0 + #define REGS_REALLOCATE 1 + #define REGS_FIXED 2 + unsigned regs_allocated : 2; + + /* Set to zero when `regex_compile' compiles a pattern; set to one + by `re_compile_fastmap' if it updates the fastmap. */ + unsigned fastmap_accurate : 1; + + /* If set, `re_match_2' does not return information about + subexpressions. */ + unsigned no_sub : 1; + + /* If set, a beginning-of-line anchor doesn't match at the + beginning of the string. */ + unsigned not_bol : 1; + + /* Similarly for an end-of-line anchor. */ + unsigned not_eol : 1; + + /* If true, an anchor at a newline matches. */ + unsigned newline_anchor : 1; + + ---------- Footnotes ---------- + + (1) Regular expressions are also referred to as "patterns," hence +the name "pattern buffer." + + +File: regex.info, Node: GNU Regular Expression Compiling, Next: GNU Matching, Prev: GNU Pattern Buffers, Up: GNU Regex Functions + +GNU Regular Expression Compiling +-------------------------------- + + In GNU, you can both match and search for a given regular expression. +To do either, you must first compile it in a pattern buffer (*note GNU +Pattern Buffers::.). + + Regular expressions match according to the syntax with which they were +compiled; with GNU, you indicate what syntax you want by setting the +variable `re_syntax_options' (declared in `regex.h' and defined in +`regex.c') before calling the compiling function, `re_compile_pattern' +(see below). *Note Syntax Bits::, and *Note Predefined Syntaxes::. 
+ + You can change the value of `re_syntax_options' at any time. +Usually, however, you set its value once and then never change it. + + `re_compile_pattern' takes a pattern buffer as an argument. You must +initialize the following fields: + +`translate initialization' +`translate' + Initialize this to point to a translate table if you want one, or + to zero if you don't. We explain translate tables in *Note GNU + Translate Tables::. + +`fastmap' + Initialize this to nonzero if you want a fastmap, or to zero if you + don't. + +`buffer' +`allocated' + If you want `re_compile_pattern' to allocate memory for the + compiled pattern, set both of these to zero. If you have an + existing block of memory (allocated with `malloc') you want Regex + to use, set `buffer' to its address and `allocated' to its size (in + bytes). + + `re_compile_pattern' uses `realloc' to extend the space for the + compiled pattern as necessary. + + To compile a pattern buffer, use: + + char * + re_compile_pattern (const char *REGEX, const int REGEX_SIZE, + struct re_pattern_buffer *PATTERN_BUFFER) + +REGEX is the regular expression's address, REGEX_SIZE is its length, +and PATTERN_BUFFER is the pattern buffer's address. + + If `re_compile_pattern' successfully compiles the regular expression, +it returns zero and sets `*PATTERN_BUFFER' to the compiled pattern. It +sets the pattern buffer's fields as follows: + +`buffer' + to the compiled pattern. + +`used' + to the number of bytes the compiled pattern in `buffer' occupies. + +`syntax' + to the current value of `re_syntax_options'. + +`re_nsub' + to the number of subexpressions in REGEX. + +`fastmap_accurate' + to zero on the theory that the pattern you're compiling is + different than the one previously compiled into `buffer'; in that + case (since you can't make a fastmap without a compiled pattern), + `fastmap' would either contain an incompatible fastmap, or nothing + at all. 
+ + If `re_compile_pattern' can't compile REGEX, it returns an error +string corresponding to one of the errors listed in *Note POSIX Regular +Expression Compiling::. + + +File: regex.info, Node: GNU Matching, Next: GNU Searching, Prev: GNU Regular Expression Compiling, Up: GNU Regex Functions + +GNU Matching +------------ + + Matching the GNU way means trying to match as much of a string as +possible starting at a position within it you specify. Once you've +compiled a pattern into a pattern buffer (*note GNU Regular Expression +Compiling::.), you can ask the matcher to match that pattern against a +string using: + + int + re_match (struct re_pattern_buffer *PATTERN_BUFFER, + const char *STRING, const int SIZE, + const int START, struct re_registers *REGS) + +PATTERN_BUFFER is the address of a pattern buffer containing a compiled +pattern. STRING is the string you want to match; it can contain +newline and null characters. SIZE is the length of that string. START +is the string index at which you want to begin matching; the first +character of STRING is at index zero. *Note Using Registers::, for an +explanation of REGS; you can safely pass zero. + + `re_match' matches the regular expression in PATTERN_BUFFER against +the string STRING according to the syntax in PATTERN_BUFFER's `syntax' +field. (*Note GNU Regular Expression Compiling::, for how to set it.) +The function returns -1 if the compiled pattern does not match any part +of STRING and -2 if an internal error happens; otherwise, it returns +how many (possibly zero) characters of STRING the pattern matched. + + An example: suppose PATTERN_BUFFER points to a pattern buffer +containing the compiled pattern for `a*', and STRING points to `aaaaab' +(whereupon SIZE should be 6). Then if START is 2, `re_match' returns 3, +i.e., `a*' would have matched the last three `a's in STRING. If START +is 0, `re_match' returns 5, i.e., `a*' would have matched all the `a's +in STRING. 
If START is either 5 or 6, it returns zero. + + If START is not between zero and SIZE, then `re_match' returns -1. + + +File: regex.info, Node: GNU Searching, Next: Matching/Searching with Split Data, Prev: GNU Matching, Up: GNU Regex Functions + +GNU Searching +------------- + + "Searching" means trying to match starting at successive positions +within a string. The function `re_search' does this. + + Before calling `re_search', you must compile your regular expression. +*Note GNU Regular Expression Compiling::. + + Here is the function declaration: + + int + re_search (struct re_pattern_buffer *PATTERN_BUFFER, + const char *STRING, const int SIZE, + const int START, const int RANGE, + struct re_registers *REGS) + +whose arguments are the same as those to `re_match' (*note GNU +Matching::.) except that the two arguments START and RANGE replace +`re_match''s argument START. + + If RANGE is positive, then `re_search' attempts a match starting +first at index START, then at START + 1 if that fails, and so on, up to +START + RANGE; if RANGE is negative, then it attempts a match starting +first at index START, then at START -1 if that fails, and so on. + + If START is not between zero and SIZE, then `re_search' returns -1. +When RANGE is positive, `re_search' adjusts RANGE so that START + RANGE +- 1 is between zero and SIZE, if necessary; that way it won't search +outside of STRING. Similarly, when RANGE is negative, `re_search' +adjusts RANGE so that START + RANGE + 1 is between zero and SIZE, if +necessary. + + If the `fastmap' field of PATTERN_BUFFER is zero, `re_search' matches +starting at consecutive positions; otherwise, it uses `fastmap' to make +the search more efficient. *Note Searching with Fastmaps::. + + If no match is found, `re_search' returns -1. If a match is found, +it returns the index where the match began. If an internal error +happens, it returns -2. 
+ + +File: regex.info, Node: Matching/Searching with Split Data, Next: Searching with Fastmaps, Prev: GNU Searching, Up: GNU Regex Functions + +Matching and Searching with Split Data +-------------------------------------- + + Using the functions `re_match_2' and `re_search_2', you can match or +search in data that is divided into two strings. + + The function: + + int + re_match_2 (struct re_pattern_buffer *BUFFER, + const char *STRING1, const int SIZE1, + const char *STRING2, const int SIZE2, + const int START, + struct re_registers *REGS, + const int STOP) + +is similar to `re_match' (*note GNU Matching::.) except that you pass +*two* data strings and sizes, and an index STOP beyond which you don't +want the matcher to try matching. As with `re_match', if it succeeds, +`re_match_2' returns how many characters of STRING it matched. Regard +STRING1 and STRING2 as concatenated when you set the arguments START and +STOP and use the contents of REGS; `re_match_2' never returns a value +larger than SIZE1 + SIZE2. + + The function: + + int + re_search_2 (struct re_pattern_buffer *BUFFER, + const char *STRING1, const int SIZE1, + const char *STRING2, const int SIZE2, + const int START, const int RANGE, + struct re_registers *REGS, + const int STOP) + +is similarly related to `re_search'. + + +File: regex.info, Node: Searching with Fastmaps, Next: GNU Translate Tables, Prev: Matching/Searching with Split Data, Up: GNU Regex Functions + +Searching with Fastmaps +----------------------- + + If you're searching through a long string, you should use a fastmap. +Without one, the searcher tries to match at consecutive positions in the +string. Generally, most of the characters in the string could not start +a match. It takes much longer to try matching at a given position in +the string than it does to check in a table whether or not the +character at that position could start a match. A "fastmap" is such a +table. 
+ + More specifically, a fastmap is an array indexed by the characters in +your character set. Under the ASCII encoding, therefore, a fastmap has +256 elements. If you want the searcher to use a fastmap with a given +pattern buffer, you must allocate the array and assign the array's +address to the pattern buffer's `fastmap' field. You either can +compile the fastmap yourself or have `re_search' do it for you; when +`fastmap' is nonzero, it automatically compiles a fastmap the first +time you search using a particular compiled pattern. + + To compile a fastmap yourself, use: + + int + re_compile_fastmap (struct re_pattern_buffer *PATTERN_BUFFER) + +PATTERN_BUFFER is the address of a pattern buffer. If the character C +could start a match for the pattern, `re_compile_fastmap' makes +`PATTERN_BUFFER->fastmap[C]' nonzero. It returns 0 if it can compile a +fastmap and -2 if there is an internal error. For example, if `|' is +the alternation operator and PATTERN_BUFFER holds the compiled pattern +for `a|b', then `re_compile_fastmap' sets `fastmap['a']' and +`fastmap['b']' (and no others). + + `re_search' uses a fastmap as it moves along in the string: it checks +the string's characters until it finds one that's in the fastmap. Then +it tries matching at that character. If the match fails, it repeats +the process. So, by using a fastmap, `re_search' doesn't waste time +trying to match at positions in the string that couldn't start a match. + + If you don't want `re_search' to use a fastmap, store zero in the +`fastmap' field of the pattern buffer before calling `re_search'. + + Once you've initialized a pattern buffer's `fastmap' field, you need +never do so again--even if you compile a new pattern in it--provided +the way the field is set still reflects whether or not you want a +fastmap. `re_search' will still either do nothing if `fastmap' is null +or, if it isn't, compile a new fastmap for the new pattern. 
+ + +File: regex.info, Node: GNU Translate Tables, Next: Using Registers, Prev: Searching with Fastmaps, Up: GNU Regex Functions + +GNU Translate Tables +-------------------- + + If you set the `translate' field of a pattern buffer to a translate +table, then the GNU Regex functions to which you've passed that pattern +buffer use it to apply a simple transformation to all the regular +expression and string characters at which they look. + + A "translate table" is an array indexed by the characters in your +character set. Under the ASCII encoding, therefore, a translate table +has 256 elements. The array's elements are also characters in your +character set. When the Regex functions see a character C, they use +`translate[C]' in its place, with one exception: the character after a +`\' is not translated. (This ensures that, the operators, e.g., `\B' +and `\b', are always distinguishable.) + + For example, a table that maps all lowercase letters to the +corresponding uppercase ones would cause the matcher to ignore +differences in case.(1) Such a table would map all characters except +lowercase letters to themselves, and lowercase letters to the +corresponding uppercase ones. Under the ASCII encoding, here's how you +could initialize such a table (we'll call it `case_fold'): + + for (i = 0; i < 256; i++) + case_fold[i] = i; + for (i = 'a'; i <= 'z'; i++) + case_fold[i] = i - ('a' - 'A'); + + You tell Regex to use a translate table on a given pattern buffer by +assigning that table's address to the `translate' field of that buffer. +If you don't want Regex to do any translation, put zero into this +field. You'll get weird results if you change the table's contents +anytime between compiling the pattern buffer, compiling its fastmap, and +matching or searching with the pattern buffer. + + ---------- Footnotes ---------- + + (1) A table that maps all uppercase letters to the corresponding +lowercase ones would work just as well for this purpose. 
+ + +File: regex.info, Node: Using Registers, Next: Freeing GNU Pattern Buffers, Prev: GNU Translate Tables, Up: GNU Regex Functions + +Using Registers +--------------- + + A group in a regular expression can match a (possibly empty) +substring of the string that the regular expression as a whole matched. +The matcher remembers the beginning and end of the substring matched by +each group. + + To find out what they matched, pass a nonzero REGS argument to a GNU +matching or searching function (*note GNU Matching::. and *Note GNU +Searching::), i.e., the address of a structure of this type, as defined +in `regex.h': + + struct re_registers + { + unsigned num_regs; + regoff_t *start; + regoff_t *end; + }; + + Except for (possibly) the NUM_REGS'th element (see below), the Ith +element of the `start' and `end' arrays records information about the +Ith group in the pattern. (They're declared as C pointers, but this is +only because not all C compilers accept zero-length arrays; +conceptually, it is simplest to think of them as arrays.) + + The `start' and `end' arrays are allocated in various ways, depending +on the value of the `regs_allocated' field in the pattern buffer passed +to the matcher. + + The simplest and perhaps most useful is to let the matcher +(re)allocate enough space to record information for all the groups in +the regular expression. If `regs_allocated' is `REGS_UNALLOCATED', the +matcher allocates 1 + RE_NSUB elements (RE_NSUB is another field in the pattern buffer; +*note GNU Pattern Buffers::.). The matcher sets the extra element to -1 and +sets `regs_allocated' to `REGS_REALLOCATE'. Then on subsequent calls +with the same pattern buffer and REGS arguments, the matcher +reallocates more space if necessary. + + It would perhaps be more logical to make the `regs_allocated' field +part of the `re_registers' structure, instead of part of the pattern +buffer. But in that case the caller would be forced to initialize the +structure before passing it. 
Much existing code doesn't do this +initialization, and it's arguably better to avoid it anyway. + + `re_compile_pattern' sets `regs_allocated' to `REGS_UNALLOCATED', so +if you use the GNU regular expression functions, you get this behavior +by default. + + xx document re_set_registers + + POSIX, on the other hand, requires a different interface: the caller +is supposed to pass in a fixed-length array which the matcher fills. +Therefore, if `regs_allocated' is `REGS_FIXED' the matcher simply fills +that array. + + The following examples illustrate the information recorded in the +`re_registers' structure. (In all of them, `(' represents the +open-group and `)' the close-group operator. The first character in +the string STRING is at index 0.) + + * If the regular expression has an I-th group not contained within + another group that matches a substring of STRING, then the + function sets `REGS->start[I]' to the index in STRING where the + substring matched by the I-th group begins, and `REGS->end[I]' to + the index just beyond that substring's end. The function sets + `REGS->start[0]' and `REGS->end[0]' to analogous information about + the entire pattern. + + For example, when you match `((a)(b))' against `ab', you get: + + * 0 in `REGS->start[0]' and 2 in `REGS->end[0]' + + * 0 in `REGS->start[1]' and 2 in `REGS->end[1]' + + * 0 in `REGS->start[2]' and 1 in `REGS->end[2]' + + * 1 in `REGS->start[3]' and 2 in `REGS->end[3]' + + * If a group matches more than once (as it might if followed by, + e.g., a repetition operator), then the function reports the + information about what the group *last* matched. 
+ + For example, when you match the pattern `(a)*' against the string + `aa', you get: + + * 0 in `REGS->start[0]' and 2 in `REGS->end[0]' + + * 1 in `REGS->start[1]' and 2 in `REGS->end[1]' + + * If the I-th group does not participate in a successful match, + e.g., it is an alternative not taken or a repetition operator + allows zero repetitions of it, then the function sets + `REGS->start[I]' and `REGS->end[I]' to -1. + + For example, when you match the pattern `(a)*b' against the string + `b', you get: + + * 0 in `REGS->start[0]' and 1 in `REGS->end[0]' + + * -1 in `REGS->start[1]' and -1 in `REGS->end[1]' + + * If the I-th group matches a zero-length string, then the function + sets `REGS->start[I]' and `REGS->end[I]' to the index just beyond + that zero-length string. + + For example, when you match the pattern `(a*)b' against the string + `b', you get: + + * 0 in `REGS->start[0]' and 1 in `REGS->end[0]' + + * 0 in `REGS->start[1]' and 0 in `REGS->end[1]' + + * If an I-th group contains a J-th group in turn not contained + within any other group within group I and the function reports a + match of the I-th group, then it records in `REGS->start[J]' and + `REGS->end[J]' the last match (if it matched) of the J-th group. 
+ + For example, when you match the pattern `((a*)b)*' against the + string `abb', group 2 last matches the empty string, so you get + what it previously matched: + + * 0 in `REGS->start[0]' and 3 in `REGS->end[0]' + + * 2 in `REGS->start[1]' and 3 in `REGS->end[1]' + + * 2 in `REGS->start[2]' and 2 in `REGS->end[2]' + + When you match the pattern `((a)*b)*' against the string `abb', + group 2 doesn't participate in the last match, so you get: + + * 0 in `REGS->start[0]' and 3 in `REGS->end[0]' + + * 2 in `REGS->start[1]' and 3 in `REGS->end[1]' + + * 0 in `REGS->start[2]' and 1 in `REGS->end[2]' + + * If an I-th group contains a J-th group in turn not contained + within any other group within group I and the function sets + `REGS->start[I]' and `REGS->end[I]' to -1, then it also sets + `REGS->start[J]' and `REGS->end[J]' to -1. + + For example, when you match the pattern `((a)*b)*c' against the + string `c', you get: + + * 0 in `REGS->start[0]' and 1 in `REGS->end[0]' + + * -1 in `REGS->start[1]' and -1 in `REGS->end[1]' + + * -1 in `REGS->start[2]' and -1 in `REGS->end[2]' + + +File: regex.info, Node: Freeing GNU Pattern Buffers, Prev: Using Registers, Up: GNU Regex Functions + +Freeing GNU Pattern Buffers +--------------------------- + + To free any allocated fields of a pattern buffer, you can use the +POSIX function described in *Note Freeing POSIX Pattern Buffers::, +since the type `regex_t'--the type for POSIX pattern buffers--is +equivalent to the type `re_pattern_buffer'. After freeing a pattern +buffer, you need to again compile a regular expression in it (*note GNU +Regular Expression Compiling::.) before passing it to a matching or +searching function. + + +File: regex.info, Node: POSIX Regex Functions, Next: BSD Regex Functions, Prev: GNU Regex Functions, Up: Programming with Regex + +POSIX Regex Functions +===================== + + If you're writing code that has to be POSIX compatible, you'll need +to use these functions. 
Their interfaces are as specified by POSIX, +draft 1003.2/D11.2. + +* Menu: + +* POSIX Pattern Buffers:: The regex_t type. +* POSIX Regular Expression Compiling:: regcomp () +* POSIX Matching:: regexec () +* Reporting Errors:: regerror () +* Using Byte Offsets:: The regmatch_t type. +* Freeing POSIX Pattern Buffers:: regfree () + + +File: regex.info, Node: POSIX Pattern Buffers, Next: POSIX Regular Expression Compiling, Up: POSIX Regex Functions + +POSIX Pattern Buffers +--------------------- + + To compile or match a given regular expression the POSIX way, you +must supply a pattern buffer exactly the way you do for GNU (*note GNU +Pattern Buffers::.). POSIX pattern buffers have type `regex_t', which +is equivalent to the GNU pattern buffer type `re_pattern_buffer'. + + +File: regex.info, Node: POSIX Regular Expression Compiling, Next: POSIX Matching, Prev: POSIX Pattern Buffers, Up: POSIX Regex Functions + +POSIX Regular Expression Compiling +---------------------------------- + + With POSIX, you can only search for a given regular expression; you +can't match it. To do this, you must first compile it in a pattern +buffer, using `regcomp'. + + To compile a pattern buffer, use: + + int + regcomp (regex_t *PREG, const char *REGEX, int CFLAGS) + +PREG is the initialized pattern buffer's address, REGEX is the regular +expression's address, and CFLAGS is the compilation flags, which Regex +considers as a collection of bits. Here are the valid bits, as defined +in `regex.h': + +`REG_EXTENDED' + says to use POSIX Extended Regular Expression syntax; if this isn't + set, then says to use POSIX Basic Regular Expression syntax. + `regcomp' sets PREG's `syntax' field accordingly. + +`REG_ICASE' + says to ignore case; `regcomp' sets PREG's `translate' field to a + translate table which ignores case, replacing anything you've put + there before. + +`REG_NOSUB' + says to set PREG's `no_sub' field; *note POSIX Matching::., for + what this means. 
+ +`REG_NEWLINE' + says that a: + + * match-any-character operator (*note Match-any-character + Operator::.) doesn't match a newline. + + * nonmatching list not containing a newline (*note List + Operators::.) doesn't match a newline. + + * match-beginning-of-line operator (*note + Match-beginning-of-line Operator::.) matches the empty string + immediately after a newline, regardless of how `REG_NOTBOL' + is set (*note POSIX Matching::., for an explanation of + `REG_NOTBOL'). + + * match-end-of-line operator (*note Match-end-of-line + Operator::.) matches the empty string immediately before a + newline, regardless of how `REG_NOTEOL' is set (*note POSIX + Matching::., for an explanation of `REG_NOTEOL'). + + If `regcomp' successfully compiles the regular expression, it returns +zero and sets `*PREG' to the compiled pattern. Except for +`syntax' (which it sets as explained above), it also sets the same +fields the same way as does the GNU compiling function (*note GNU +Regular Expression Compiling::.). + + If `regcomp' can't compile the regular expression, it returns one of +the error codes listed here. (Except when noted differently, the +syntax in all examples below is basic regular expression syntax.) + +`REG_BADRPT' + For example, the consecutive repetition operators `**' in `a**' + are invalid. As another example, if the syntax is extended + regular expression syntax, then the repetition operator `*' with + nothing on which to operate in `*' is invalid. + +`REG_BADBR' + For example, the COUNT `-1' in `a\{-1' is invalid. + +`REG_EBRACE' + For example, `a\{1' is missing a close-interval operator. + +`REG_EBRACK' + For example, `[a' is missing a close-list operator. + +`REG_ERANGE' + For example, the range ending point `a' that collates lower than + does its starting point `z' in `[z-a]' is invalid. Also, the + range with the character class `[:alpha:]' as its starting point in + `[[:alpha:]-|]' is invalid. 
+ +`REG_ECTYPE' + For example, the character class name `foo' in `[[:foo:]' is + invalid. + +`REG_EPAREN' + For example, `a\)' is missing an open-group operator and `\(a' is + missing a close-group operator. + +`REG_ESUBREG' + For example, the back reference `\2' that refers to a nonexistent + subexpression in `\(a\)\2' is invalid. + +`REG_EEND' + Returned when a regular expression causes no other more specific + error. + +`REG_EESCAPE' + For example, the trailing backslash `\' in `a\' is invalid, as is + the one in `\'. + +`REG_BADPAT' + For example, in the extended regular expression syntax, the empty + group `()' in `a()b' is invalid. + +`REG_ESIZE' + Returned when a regular expression needs a pattern buffer larger + than 65536 bytes. + +`REG_ESPACE' + Returned when a regular expression causes Regex to run out of + memory. + + +File: regex.info, Node: POSIX Matching, Next: Reporting Errors, Prev: POSIX Regular Expression Compiling, Up: POSIX Regex Functions + +POSIX Matching +-------------- + + Matching the POSIX way means trying to match a null-terminated string +starting at its first character. Once you've compiled a pattern into a +pattern buffer (*note POSIX Regular Expression Compiling::.), you can +ask the matcher to match that pattern against a string using: + + int + regexec (const regex_t *PREG, const char *STRING, + size_t NMATCH, regmatch_t PMATCH[], int EFLAGS) + +PREG is the address of a pattern buffer for a compiled pattern. STRING +is the string you want to match. + + *Note Using Byte Offsets::, for an explanation of PMATCH. If you +pass zero for NMATCH or you compiled PREG with the compilation flag +`REG_NOSUB' set, then `regexec' will ignore PMATCH; otherwise, you must +allocate it to have at least NMATCH elements. `regexec' will record +NMATCH byte offsets in PMATCH, and set to -1 any unused elements up to +PMATCH`[NMATCH - 1]'. + + EFLAGS specifies "execution flags"--namely, the two bits `REG_NOTBOL' +and `REG_NOTEOL' (defined in `regex.h'). 
If you set `REG_NOTBOL', then +the match-beginning-of-line operator (*note Match-beginning-of-line +Operator::.) always fails to match. This lets you match against pieces +of a line, as you would need to if, say, searching for repeated +instances of a given pattern in a line; it would work correctly for +patterns both with and without match-beginning-of-line operators. +`REG_NOTEOL' works analogously for the match-end-of-line operator +(*note Match-end-of-line Operator::.); it exists for symmetry. + + `regexec' tries to find a match for PREG in STRING according to the +syntax in PREG's `syntax' field. (*Note POSIX Regular Expression +Compiling::, for how to set it.) The function returns zero if the +compiled pattern matches STRING and `REG_NOMATCH' (defined in +`regex.h') if it doesn't. + + +File: regex.info, Node: Reporting Errors, Next: Using Byte Offsets, Prev: POSIX Matching, Up: POSIX Regex Functions + +Reporting Errors +---------------- + + If either `regcomp' or `regexec' fails, it returns a nonzero error +code, the possibilities for which are defined in `regex.h'. *Note +POSIX Regular Expression Compiling::, and *Note POSIX Matching::, for +what these codes mean. To get an error string corresponding to these +codes, you can use: + + size_t + regerror (int ERRCODE, + const regex_t *PREG, + char *ERRBUF, + size_t ERRBUF_SIZE) + +ERRCODE is an error code, PREG is the address of the pattern buffer +which provoked the error, ERRBUF is the error buffer, and ERRBUF_SIZE +is ERRBUF's size. + + `regerror' returns the size in bytes of the error string +corresponding to ERRCODE (including its terminating null). If ERRBUF +and ERRBUF_SIZE are nonzero, it also returns in ERRBUF the first +ERRBUF_SIZE - 1 characters of the error string, followed by a null. +ERRBUF_SIZE must be a nonnegative number less than or equal to the size +in bytes of ERRBUF. 
+ + You can call `regerror' with a null ERRBUF and a zero ERRBUF_SIZE to +determine how large ERRBUF need be to accommodate `regerror''s error +string. + + +File: regex.info, Node: Using Byte Offsets, Next: Freeing POSIX Pattern Buffers, Prev: Reporting Errors, Up: POSIX Regex Functions + +Using Byte Offsets +------------------ + + In POSIX, variables of type `regmatch_t' hold information analogous to, +but not identical to, GNU's registers (*note Using Registers::.). +To get information about registers in POSIX, pass to `regexec' a +nonzero PMATCH of type `regmatch_t', i.e., the address of a structure +of this type, defined in `regex.h': + + typedef struct + { + regoff_t rm_so; + regoff_t rm_eo; + } regmatch_t; + + When reading in *Note Using Registers::, about how the matching +function stores the information into the registers, substitute PMATCH +for REGS, `PMATCH[I].rm_so' for `REGS->start[I]' and +`PMATCH[I].rm_eo' for `REGS->end[I]'. + + +File: regex.info, Node: Freeing POSIX Pattern Buffers, Prev: Using Byte Offsets, Up: POSIX Regex Functions + +Freeing POSIX Pattern Buffers +----------------------------- + + To free any allocated fields of a pattern buffer, use: + + void + regfree (regex_t *PREG) + +PREG is the pattern buffer whose allocated fields you want freed. +`regfree' also sets PREG's `allocated' and `used' fields to zero. +After freeing a pattern buffer, you need to again compile a regular +expression in it (*note POSIX Regular Expression Compiling::.) before +passing it to the matching function (*note POSIX Matching::.). + + +File: regex.info, Node: BSD Regex Functions, Prev: POSIX Regex Functions, Up: Programming with Regex + +BSD Regex Functions +=================== + + If you're writing code that has to be Berkeley UNIX compatible, +you'll need to use these functions whose interfaces are the same as +those in Berkeley UNIX. 
+ +* Menu: + +* BSD Regular Expression Compiling:: re_comp () +* BSD Searching:: re_exec () + + +File: regex.info, Node: BSD Regular Expression Compiling, Next: BSD Searching, Up: BSD Regex Functions + +BSD Regular Expression Compiling +-------------------------------- + + With Berkeley UNIX, you can only search for a given regular +expression; you can't match one. To search for it, you must first +compile it. Before you compile it, you must indicate the regular +expression syntax you want it compiled according to by setting the +variable `re_syntax_options' (declared in `regex.h') to some syntax +(*note Regular Expression Syntax::.). + + To compile a regular expression use: + + char * + re_comp (char *REGEX) + +REGEX is the address of a null-terminated regular expression. +`re_comp' uses an internal pattern buffer, so you can use only the most +recently compiled pattern buffer. This means that if you want to use a +given regular expression that you've already compiled--but it isn't the +latest one you've compiled--you'll have to recompile it. If you call +`re_comp' with the null string (*not* the empty string) as the +argument, it doesn't change the contents of the pattern buffer. + + If `re_comp' successfully compiles the regular expression, it returns +zero. If it can't compile the regular expression, it returns an error +string. `re_comp''s error messages are identical to those of +`re_compile_pattern' (*note GNU Regular Expression Compiling::.). + + +File: regex.info, Node: BSD Searching, Prev: BSD Regular Expression Compiling, Up: BSD Regex Functions + +BSD Searching +------------- + + Searching the Berkeley UNIX way means searching in a string starting +at its first character and trying successive positions within it to +find a match. 
Once you've compiled a pattern using `re_comp' (*note +BSD Regular Expression Compiling::.), you can ask Regex to search for +that pattern in a string using: + + int + re_exec (char *STRING) + +STRING is the address of the null-terminated string in which you want +to search. + + `re_exec' returns either 1 for success or 0 for failure. It +automatically uses a GNU fastmap (*note Searching with Fastmaps::.). + + +File: regex.info, Node: Copying, Next: Index, Prev: Programming with Regex, Up: Top + +GNU GENERAL PUBLIC LICENSE +************************** + + Version 2, June 1991 + + Copyright (C) 1989, 1991 Free Software Foundation, Inc. + 675 Mass Ave, Cambridge, MA 02139, USA + + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + +Preamble +======== + + The licenses for most software are designed to take away your freedom +to share and change it. By contrast, the GNU General Public License is +intended to guarantee your freedom to share and change free +software--to make sure the software is free for all its users. This +General Public License applies to most of the Free Software +Foundation's software and to any other program whose authors commit to +using it. (Some other Free Software Foundation software is covered by +the GNU Library General Public License instead.) You can apply it to +your programs, too. + + When we speak of free software, we are referring to freedom, not +price. Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +this service if you wish), that you receive source code or can get it +if you want it, that you can change the software or use pieces of it in +new free programs; and that you know you can do these things. + + To protect your rights, we need to make restrictions that forbid +anyone to deny you these rights or to ask you to surrender the rights. 
+These restrictions translate to certain responsibilities for you if you +distribute copies of the software, or if you modify it. + + For example, if you distribute copies of such a program, whether +gratis or for a fee, you must give the recipients all the rights that +you have. You must make sure that they, too, receive or can get the +source code. And you must show them these terms so they know their +rights. + + We protect your rights with two steps: (1) copyright the software, and +(2) offer you this license which gives you legal permission to copy, +distribute and/or modify the software. + + Also, for each author's protection and ours, we want to make certain +that everyone understands that there is no warranty for this free +software. If the software is modified by someone else and passed on, we +want its recipients to know that what they have is not the original, so +that any problems introduced by others will not reflect on the original +authors' reputations. + + Finally, any free program is threatened constantly by software +patents. We wish to avoid the danger that redistributors of a free +program will individually obtain patent licenses, in effect making the +program proprietary. To prevent this, we have made it clear that any +patent must be licensed for everyone's free use or not licensed at all. + + The precise terms and conditions for copying, distribution and +modification follow. + + TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION + + 1. This License applies to any program or other work which contains a + notice placed by the copyright holder saying it may be distributed + under the terms of this General Public License. The "Program", + below, refers to any such program or work, and a "work based on + the Program" means either the Program or any derivative work under + copyright law: that is to say, a work containing the Program or a + portion of it, either verbatim or with modifications and/or + translated into another language. 
(Hereinafter, translation is + included without limitation in the term "modification".) Each + licensee is addressed as "you". + + Activities other than copying, distribution and modification are + not covered by this License; they are outside its scope. The act + of running the Program is not restricted, and the output from the + Program is covered only if its contents constitute a work based on + the Program (independent of having been made by running the + Program). Whether that is true depends on what the Program does. + + 2. You may copy and distribute verbatim copies of the Program's + source code as you receive it, in any medium, provided that you + conspicuously and appropriately publish on each copy an appropriate + copyright notice and disclaimer of warranty; keep intact all the + notices that refer to this License and to the absence of any + warranty; and give any other recipients of the Program a copy of + this License along with the Program. + + You may charge a fee for the physical act of transferring a copy, + and you may at your option offer warranty protection in exchange + for a fee. + + 3. You may modify your copy or copies of the Program or any portion + of it, thus forming a work based on the Program, and copy and + distribute such modifications or work under the terms of Section 1 + above, provided that you also meet all of these conditions: + + a. You must cause the modified files to carry prominent notices + stating that you changed the files and the date of any change. + + b. You must cause any work that you distribute or publish, that + in whole or in part contains or is derived from the Program + or any part thereof, to be licensed as a whole at no charge + to all third parties under the terms of this License. + + c. 
If the modified program normally reads commands interactively + when run, you must cause it, when started running for such + interactive use in the most ordinary way, to print or display + an announcement including an appropriate copyright notice and + a notice that there is no warranty (or else, saying that you + provide a warranty) and that users may redistribute the + program under these conditions, and telling the user how to + view a copy of this License. (Exception: if the Program + itself is interactive but does not normally print such an + announcement, your work based on the Program is not required + to print an announcement.) + + These requirements apply to the modified work as a whole. If + identifiable sections of that work are not derived from the + Program, and can be reasonably considered independent and separate + works in themselves, then this License, and its terms, do not + apply to those sections when you distribute them as separate + works. But when you distribute the same sections as part of a + whole which is a work based on the Program, the distribution of + the whole must be on the terms of this License, whose permissions + for other licensees extend to the entire whole, and thus to each + and every part regardless of who wrote it. + + Thus, it is not the intent of this section to claim rights or + contest your rights to work written entirely by you; rather, the + intent is to exercise the right to control the distribution of + derivative or collective works based on the Program. + + In addition, mere aggregation of another work not based on the + Program with the Program (or with a work based on the Program) on + a volume of a storage or distribution medium does not bring the + other work under the scope of this License. + + 4. 
You may copy and distribute the Program (or a work based on it, + under Section 2) in object code or executable form under the terms + of Sections 1 and 2 above provided that you also do one of the + following: + + a. Accompany it with the complete corresponding machine-readable + source code, which must be distributed under the terms of + Sections 1 and 2 above on a medium customarily used for + software interchange; or, + + b. Accompany it with a written offer, valid for at least three + years, to give any third party, for a charge no more than your + cost of physically performing source distribution, a complete + machine-readable copy of the corresponding source code, to be + distributed under the terms of Sections 1 and 2 above on a + medium customarily used for software interchange; or, + + c. Accompany it with the information you received as to the offer + to distribute corresponding source code. (This alternative is + allowed only for noncommercial distribution and only if you + received the program in object code or executable form with + such an offer, in accord with Subsection b above.) + + The source code for a work means the preferred form of the work for + making modifications to it. For an executable work, complete + source code means all the source code for all modules it contains, + plus any associated interface definition files, plus the scripts + used to control compilation and installation of the executable. + However, as a special exception, the source code distributed need + not include anything that is normally distributed (in either + source or binary form) with the major components (compiler, + kernel, and so on) of the operating system on which the executable + runs, unless that component itself accompanies the executable. 
+ + If distribution of executable or object code is made by offering + access to copy from a designated place, then offering equivalent + access to copy the source code from the same place counts as + distribution of the source code, even though third parties are not + compelled to copy the source along with the object code. + + 5. You may not copy, modify, sublicense, or distribute the Program + except as expressly provided under this License. Any attempt + otherwise to copy, modify, sublicense or distribute the Program is + void, and will automatically terminate your rights under this + License. However, parties who have received copies, or rights, + from you under this License will not have their licenses + terminated so long as such parties remain in full compliance. + + 6. You are not required to accept this License, since you have not + signed it. However, nothing else grants you permission to modify + or distribute the Program or its derivative works. These actions + are prohibited by law if you do not accept this License. + Therefore, by modifying or distributing the Program (or any work + based on the Program), you indicate your acceptance of this + License to do so, and all its terms and conditions for copying, + distributing or modifying the Program or works based on it. + + 7. Each time you redistribute the Program (or any work based on the + Program), the recipient automatically receives a license from the + original licensor to copy, distribute or modify the Program + subject to these terms and conditions. You may not impose any + further restrictions on the recipients' exercise of the rights + granted herein. You are not responsible for enforcing compliance + by third parties to this License. + + 8. 
If, as a consequence of a court judgment or allegation of patent + infringement or for any other reason (not limited to patent + issues), conditions are imposed on you (whether by court order, + agreement or otherwise) that contradict the conditions of this + License, they do not excuse you from the conditions of this + License. If you cannot distribute so as to satisfy simultaneously + your obligations under this License and any other pertinent + obligations, then as a consequence you may not distribute the + Program at all. For example, if a patent license would not permit + royalty-free redistribution of the Program by all those who + receive copies directly or indirectly through you, then the only + way you could satisfy both it and this License would be to refrain + entirely from distribution of the Program. + + If any portion of this section is held invalid or unenforceable + under any particular circumstance, the balance of the section is + intended to apply and the section as a whole is intended to apply + in other circumstances. + + It is not the purpose of this section to induce you to infringe any + patents or other property right claims or to contest validity of + any such claims; this section has the sole purpose of protecting + the integrity of the free software distribution system, which is + implemented by public license practices. Many people have made + generous contributions to the wide range of software distributed + through that system in reliance on consistent application of that + system; it is up to the author/donor to decide if he or she is + willing to distribute software through any other system and a + licensee cannot impose that choice. + + This section is intended to make thoroughly clear what is believed + to be a consequence of the rest of this License. + + 9. 
If the distribution and/or use of the Program is restricted in + certain countries either by patents or by copyrighted interfaces, + the original copyright holder who places the Program under this + License may add an explicit geographical distribution limitation + excluding those countries, so that distribution is permitted only + in or among countries not thus excluded. In such case, this + License incorporates the limitation as if written in the body of + this License. + + 10. The Free Software Foundation may publish revised and/or new + versions of the General Public License from time to time. Such + new versions will be similar in spirit to the present version, but + may differ in detail to address new problems or concerns. + + Each version is given a distinguishing version number. If the + Program specifies a version number of this License which applies + to it and "any later version", you have the option of following + the terms and conditions either of that version or of any later + version published by the Free Software Foundation. If the Program + does not specify a version number of this License, you may choose + any version ever published by the Free Software Foundation. + + 11. If you wish to incorporate parts of the Program into other free + programs whose distribution conditions are different, write to the + author to ask for permission. For software which is copyrighted + by the Free Software Foundation, write to the Free Software + Foundation; we sometimes make exceptions for this. Our decision + will be guided by the two goals of preserving the free status of + all derivatives of our free software and of promoting the sharing + and reuse of software generally. + + NO WARRANTY + + 12. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO + WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE + LAW. 
EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT + HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT + WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT + NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS TO THE + QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE + PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY + SERVICING, REPAIR OR CORRECTION. + + 13. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN + WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY + MODIFY AND/OR REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE + LIABLE TO YOU FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, + INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR + INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF + DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU + OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY + OTHER PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN + ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. + + END OF TERMS AND CONDITIONS + +Appendix: How to Apply These Terms to Your New Programs +======================================================= + + If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these +terms. + + To do so, attach the following notices to the program. It is safest +to attach them to the start of each source file to most effectively +convey the exclusion of warranty; and each file should have at least +the "copyright" line and a pointer to where the full notice is found. + + ONE LINE TO GIVE THE PROGRAM'S NAME AND A BRIEF IDEA OF WHAT IT DOES. 
+ Copyright (C) 19YY NAME OF AUTHOR + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + + Also add information on how to contact you by electronic and paper +mail. + + If the program is interactive, make it output a short notice like this +when it starts in an interactive mode: + + Gnomovision version 69, Copyright (C) 19YY NAME OF AUTHOR + Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. + This is free software, and you are welcome to redistribute it + under certain conditions; type `show c' for details. + + The hypothetical commands `show w' and `show c' should show the +appropriate parts of the General Public License. Of course, the +commands you use may be called something other than `show w' and `show +c'; they could even be mouse-clicks or menu items--whatever suits your +program. + + You should also get your employer (if you work as a programmer) or +your school, if any, to sign a "copyright disclaimer" for the program, +if necessary. Here is a sample; alter the names: + + Yoyodyne, Inc., hereby disclaims all copyright interest in the program + `Gnomovision' (which makes passes at compilers) written by James Hacker. + + SIGNATURE OF TY COON, 1 April 1989 + Ty Coon, President of Vice + + This General Public License does not permit incorporating your +program into proprietary programs. 
If your program is a subroutine +library, you may consider it more useful to permit linking proprietary +applications with the library. If this is what you want to do, use the +GNU Library General Public License instead of this License. + + +File: regex.info, Node: Index, Prev: Copying, Up: Top + +Index +***** + +* Menu: + +* $: Match-end-of-line Operator. +* (: Grouping Operators. +* ): Grouping Operators. +* *: Match-zero-or-more Operator. +* +: Match-one-or-more Operator. +* -: List Operators. +* .: Match-any-character Operator. +* :] in regex: Character Class Operators. +* ?: Match-zero-or-one Operator. +* {: Interval Operators. +* }: Interval Operators. +* [: in regex: Character Class Operators. +* [^: List Operators. +* [: List Operators. +* \': Match-end-of-buffer Operator. +* \<: Match-beginning-of-word Operator. +* \>: Match-end-of-word Operator. +* \{: Interval Operators. +* \}: Interval Operators. +* \b: Match-word-boundary Operator. +* \B: Match-within-word Operator. +* \s: Match-syntactic-class Operator. +* \S: Match-not-syntactic-class Operator. +* \w: Match-word-constituent Operator. +* \W: Match-non-word-constituent Operator. +* \`: Match-beginning-of-buffer Operator. +* \: List Operators. +* ]: List Operators. +* ^: List Operators. +* allocated initialization: GNU Regular Expression Compiling. +* alternation operator: Alternation Operator. +* alternation operator and ^: Match-beginning-of-line Operator. +* anchoring: Anchoring Operators. +* anchors: Match-end-of-line Operator. +* anchors: Match-beginning-of-line Operator. +* Awk: Predefined Syntaxes. +* back references: Back-reference Operator. +* backtracking: Match-zero-or-more Operator. +* backtracking: Alternation Operator. +* beginning-of-line operator: Match-beginning-of-line Operator. +* bracket expression: List Operators. +* buffer field, set by re_compile_pattern: GNU Regular Expression Compiling. +* buffer initialization: GNU Regular Expression Compiling. 
+* character classes: Character Class Operators. +* Egrep: Predefined Syntaxes. +* Emacs: Predefined Syntaxes. +* end in struct re_registers: Using Registers. +* end-of-line operator: Match-end-of-line Operator. +* fastmap initialization: GNU Regular Expression Compiling. +* fastmaps: Searching with Fastmaps. +* fastmap_accurate field, set by re_compile_pattern: GNU Regular Expression Compiling. +* Grep: Predefined Syntaxes. +* grouping: Grouping Operators. +* ignoring case: POSIX Regular Expression Compiling. +* interval expression: Interval Operators. +* matching list: List Operators. +* matching newline: List Operators. +* matching with GNU functions: GNU Matching. +* newline_anchor field in pattern buffer: Match-beginning-of-line Operator. +* nonmatching list: List Operators. +* not_bol field in pattern buffer: Match-beginning-of-line Operator. +* num_regs in struct re_registers: Using Registers. +* open-group operator and ^: Match-beginning-of-line Operator. +* or operator: Alternation Operator. +* parenthesizing: Grouping Operators. +* pattern buffer initialization: GNU Regular Expression Compiling. +* pattern buffer, definition of: GNU Pattern Buffers. +* POSIX Awk: Predefined Syntaxes. +* range argument to re_search: GNU Searching. +* regex.c: Overview. +* regex.h: Overview. +* regexp anchoring: Anchoring Operators. +* regmatch_t: Using Byte Offsets. +* regs_allocated: Using Registers. +* REGS_FIXED: Using Registers. +* REGS_REALLOCATE: Using Registers. +* REGS_UNALLOCATED: Using Registers. +* regular expressions, syntax of: Regular Expression Syntax. +* REG_EXTENDED: POSIX Regular Expression Compiling. +* REG_ICASE: POSIX Regular Expression Compiling. +* REG_NEWLINE: POSIX Regular Expression Compiling. +* REG_NOSUB: POSIX Regular Expression Compiling. +* RE_BACKSLASH_ESCAPE_IN_LIST: Syntax Bits. +* RE_BK_PLUS_QM: Syntax Bits. +* RE_CHAR_CLASSES: Syntax Bits. +* RE_CONTEXT_INDEP_ANCHORS: Syntax Bits. 
+* RE_CONTEXT_INDEP_ANCHORS (and ^): Match-beginning-of-line Operator. +* RE_CONTEXT_INDEP_OPS: Syntax Bits. +* RE_CONTEXT_INVALID_OPS: Syntax Bits. +* RE_DOT_NEWLINE: Syntax Bits. +* RE_DOT_NOT_NULL: Syntax Bits. +* RE_INTERVALS: Syntax Bits. +* RE_LIMITED_OPS: Syntax Bits. +* RE_NEWLINE_ALT: Syntax Bits. +* RE_NO_BK_BRACES: Syntax Bits. +* RE_NO_BK_PARENS: Syntax Bits. +* RE_NO_BK_REFS: Syntax Bits. +* RE_NO_BK_VBAR: Syntax Bits. +* RE_NO_EMPTY_RANGES: Syntax Bits. +* re_nsub field, set by re_compile_pattern: GNU Regular Expression Compiling. +* re_pattern_buffer definition: GNU Pattern Buffers. +* re_registers: Using Registers. +* re_syntax_options initialization: GNU Regular Expression Compiling. +* RE_UNMATCHED_RIGHT_PAREN_ORD: Syntax Bits. +* searching with GNU functions: GNU Searching. +* start argument to re_search: GNU Searching. +* start in struct re_registers: Using Registers. +* struct re_pattern_buffer definition: GNU Pattern Buffers. +* subexpressions: Grouping Operators. +* syntax field, set by re_compile_pattern: GNU Regular Expression Compiling. +* syntax bits: Syntax Bits. +* syntax initialization: GNU Regular Expression Compiling. +* syntax of regular expressions: Regular Expression Syntax. +* translate initialization: GNU Regular Expression Compiling. +* used field, set by re_compile_pattern: GNU Regular Expression Compiling. +* word boundaries, matching: Match-word-boundary Operator. +* \: The Backslash Character. +* \(: Grouping Operators. +* \): Grouping Operators. +* \|: Alternation Operator. +* ^: Match-beginning-of-line Operator. +* |: Alternation Operator. + + + +Tag Table: +Node: Top1064 +Node: Overview4562 +Node: Regular Expression Syntax6746 +Node: Syntax Bits7916 +Node: Predefined Syntaxes14018 +Node: Collating Elements vs. 
Characters17872 +Node: The Backslash Character18835 +Node: Common Operators21992 +Node: Match-self Operator23445 +Node: Match-any-character Operator23941 +Node: Concatenation Operator24520 +Node: Repetition Operators25017 +Node: Match-zero-or-more Operator25436 +Node: Match-one-or-more Operator27483 +Node: Match-zero-or-one Operator28341 +Node: Interval Operators29196 +Node: Alternation Operator30991 +Node: List Operators32489 +Node: Character Class Operators35272 +Node: Range Operator36901 +Node: Grouping Operators38930 +Node: Back-reference Operator40251 +Node: Anchoring Operators43073 +Node: Match-beginning-of-line Operator43447 +Node: Match-end-of-line Operator44779 +Node: GNU Operators45518 +Node: Word Operators45767 +Node: Non-Emacs Syntax Tables46391 +Node: Match-word-boundary Operator47465 +Node: Match-within-word Operator47858 +Node: Match-beginning-of-word Operator48255 +Node: Match-end-of-word Operator48588 +Node: Match-word-constituent Operator48908 +Node: Match-non-word-constituent Operator49234 +Node: Buffer Operators49545 +Node: Match-beginning-of-buffer Operator49952 +Node: Match-end-of-buffer Operator50264 +Node: GNU Emacs Operators50558 +Node: Syntactic Class Operators50901 +Node: Emacs Syntax Tables51307 +Node: Match-syntactic-class Operator51963 +Node: Match-not-syntactic-class Operator52560 +Node: What Gets Matched?53150 +Node: Programming with Regex53799 +Node: GNU Regex Functions54237 +Node: GNU Pattern Buffers55078 +Node: GNU Regular Expression Compiling58303 +Node: GNU Matching61181 +Node: GNU Searching63101 +Node: Matching/Searching with Split Data64913 +Node: Searching with Fastmaps66369 +Node: GNU Translate Tables68921 +Node: Using Registers70892 +Node: Freeing GNU Pattern Buffers77000 +Node: POSIX Regex Functions77593 +Node: POSIX Pattern Buffers78266 +Node: POSIX Regular Expression Compiling78709 +Node: POSIX Matching82836 +Node: Reporting Errors84791 +Node: Using Byte Offsets86048 +Node: Freeing POSIX Pattern Buffers86861 +Node: BSD 
Regex Functions87467 +Node: BSD Regular Expression Compiling87886 +Node: BSD Searching89258 +Node: Copying89960 +Node: Index109122 + +End Tag Table diff --git a/regex-0.12/doc/regex.texi b/regex-0.12/doc/regex.texi @@ -0,0 +1,3138 @@ +\input texinfo +@c %**start of header +@setfilename regex.info +@settitle Regex +@c %**end of header + +@c \\{fill-paragraph} works better (for me, anyway) if the text in the +@c source file isn't indented. +@paragraphindent 2 + +@c Define a new index for our magic constants. +@defcodeindex cn + +@c Put everything in one index (arbitrarily chosen to be the concept index). +@syncodeindex cn cp +@syncodeindex ky cp +@syncodeindex pg cp +@syncodeindex tp cp +@syncodeindex vr cp + +@c Here is what we use in the Info `dir' file: +@c * Regex: (regex). Regular expression library. + + +@ifinfo +This file documents the GNU regular expression library. + +Copyright (C) 1992, 1993 Free Software Foundation, Inc. + +Permission is granted to make and distribute verbatim copies of this +manual provided the copyright notice and this permission notice are +preserved on all copies. + +@ignore +Permission is granted to process this file through TeX and print the +results, provided the printed document carries a copying permission +notice identical to this one except for the removal of this paragraph +(this paragraph not being relevant to the printed manual). +@end ignore + +Permission is granted to copy and distribute modified versions of this +manual under the conditions for verbatim copying, provided also that the +section entitled ``GNU General Public License'' is included exactly as +in the original, and provided that the entire resulting derived work is +distributed under the terms of a permission notice identical to this one. 
+ +Permission is granted to copy and distribute translations of this manual +into another language, under the above conditions for modified versions, +except that the section entitled ``GNU General Public License'' may be +included in a translation approved by the Free Software Foundation +instead of in the original English. +@end ifinfo + + +@titlepage + +@title Regex +@subtitle edition 0.12a +@subtitle 19 September 1992 +@author Kathryn A. Hargreaves +@author Karl Berry + +@page + +@vskip 0pt plus 1filll +Copyright @copyright{} 1992 Free Software Foundation. + +Permission is granted to make and distribute verbatim copies of this +manual provided the copyright notice and this permission notice are +preserved on all copies. + +Permission is granted to copy and distribute modified versions of this +manual under the conditions for verbatim copying, provided also that the +section entitled ``GNU General Public License'' is included exactly as +in the original, and provided that the entire resulting derived work is +distributed under the terms of a permission notice identical to this +one. + +Permission is granted to copy and distribute translations of this manual +into another language, under the above conditions for modified versions, +except that the section entitled ``GNU General Public License'' may be +included in a translation approved by the Free Software Foundation +instead of in the original English. + +@end titlepage + + +@ifinfo +@node Top, Overview, (dir), (dir) +@top Regular Expression Library + +This manual documents how to program with the GNU regular expression +library. This is edition 0.12a of the manual, 19 September 1992. + +The first part of this master menu lists the major nodes in this Info +document, including the index. The rest of the menu lists all the +lower level nodes in the document. 
+ +@menu +* Overview:: +* Regular Expression Syntax:: +* Common Operators:: +* GNU Operators:: +* GNU Emacs Operators:: +* What Gets Matched?:: +* Programming with Regex:: +* Copying:: Copying and sharing Regex. +* Index:: General index. + --- The Detailed Node Listing --- + +Regular Expression Syntax + +* Syntax Bits:: +* Predefined Syntaxes:: +* Collating Elements vs. Characters:: +* The Backslash Character:: + +Common Operators + +* Match-self Operator:: Ordinary characters. +* Match-any-character Operator:: . +* Concatenation Operator:: Juxtaposition. +* Repetition Operators:: * + ? @{@} +* Alternation Operator:: | +* List Operators:: [...] [^...] +* Grouping Operators:: (...) +* Back-reference Operator:: \digit +* Anchoring Operators:: ^ $ + +Repetition Operators + +* Match-zero-or-more Operator:: * +* Match-one-or-more Operator:: + +* Match-zero-or-one Operator:: ? +* Interval Operators:: @{@} + +List Operators (@code{[} @dots{} @code{]} and @code{[^} @dots{} @code{]}) + +* Character Class Operators:: [:class:] +* Range Operator:: start-end + +Anchoring Operators + +* Match-beginning-of-line Operator:: ^ +* Match-end-of-line Operator:: $ + +GNU Operators + +* Word Operators:: +* Buffer Operators:: + +Word Operators + +* Non-Emacs Syntax Tables:: +* Match-word-boundary Operator:: \b +* Match-within-word Operator:: \B +* Match-beginning-of-word Operator:: \< +* Match-end-of-word Operator:: \> +* Match-word-constituent Operator:: \w +* Match-non-word-constituent Operator:: \W + +Buffer Operators + +* Match-beginning-of-buffer Operator:: \` +* Match-end-of-buffer Operator:: \' + +GNU Emacs Operators + +* Syntactic Class Operators:: + +Syntactic Class Operators + +* Emacs Syntax Tables:: +* Match-syntactic-class Operator:: \sCLASS +* Match-not-syntactic-class Operator:: \SCLASS + +Programming with Regex + +* GNU Regex Functions:: +* POSIX Regex Functions:: +* BSD Regex Functions:: + +GNU Regex Functions + +* GNU Pattern Buffers:: The re_pattern_buffer type. 
+* GNU Regular Expression Compiling:: re_compile_pattern () +* GNU Matching:: re_match () +* GNU Searching:: re_search () +* Matching/Searching with Split Data:: re_match_2 (), re_search_2 () +* Searching with Fastmaps:: re_compile_fastmap () +* GNU Translate Tables:: The `translate' field. +* Using Registers:: The re_registers type and related fns. +* Freeing GNU Pattern Buffers:: regfree () + +POSIX Regex Functions + +* POSIX Pattern Buffers:: The regex_t type. +* POSIX Regular Expression Compiling:: regcomp () +* POSIX Matching:: regexec () +* Reporting Errors:: regerror () +* Using Byte Offsets:: The regmatch_t type. +* Freeing POSIX Pattern Buffers:: regfree () + +BSD Regex Functions + +* BSD Regular Expression Compiling:: re_comp () +* BSD Searching:: re_exec () +@end menu +@end ifinfo +@node Overview, Regular Expression Syntax, Top, Top +@chapter Overview + +A @dfn{regular expression} (or @dfn{regexp}, or @dfn{pattern}) is a text +string that describes some (mathematical) set of strings. A regexp +@var{r} @dfn{matches} a string @var{s} if @var{s} is in the set of +strings described by @var{r}. + +Using the Regex library, you can: + +@itemize @bullet + +@item +see if a string matches a specified pattern as a whole, and + +@item +search within a string for a substring matching a specified pattern. + +@end itemize + +Some regular expressions match only one string, i.e., the set they +describe has only one member. For example, the regular expression +@samp{foo} matches the string @samp{foo} and no others. Other regular +expressions match more than one string, i.e., the set they describe has +more than one member. For example, the regular expression @samp{f*} +matches the set of strings made up of any number (including zero) of +@samp{f}s. 
As you can see, some characters in regular expressions match +themselves (such as @samp{f}) and some don't (such as @samp{*}); the +ones that don't match themselves instead let you specify patterns that +describe many different strings. + +To either match or search for a regular expression with the Regex +library functions, you must first compile it with a Regex pattern +compiling function. A @dfn{compiled pattern} is a regular expression +converted to the internal format used by the library functions. Once +you've compiled a pattern, you can use it for matching or searching any +number of times. + +The Regex library consists of two source files: @file{regex.h} and +@file{regex.c}. +@pindex regex.h +@pindex regex.c +Regex provides three groups of functions with which you can operate on +regular expressions. One group---the @sc{gnu} group---is more powerful +but not completely compatible with the other two, namely the @sc{posix} +and Berkeley @sc{unix} groups; its interface was designed specifically +for @sc{gnu}. The other groups have the same interfaces as do the +regular expression functions in @sc{posix} and Berkeley +@sc{unix}. + +We wrote this chapter with programmers in mind, not users of +programs---such as Emacs---that use Regex. We describe the Regex +library in its entirety, not how to write regular expressions that a +particular program understands. + + +@node Regular Expression Syntax, Common Operators, Overview, Top +@chapter Regular Expression Syntax + +@cindex regular expressions, syntax of +@cindex syntax of regular expressions + +@dfn{Characters} are things you can type. @dfn{Operators} are things in +a regular expression that match one or more characters. You compose +regular expressions from operators, which in turn you specify using one +or more characters. + +Most characters represent what we call the match-self operator, i.e., +they match themselves; we call these characters @dfn{ordinary}. 
Other +characters represent either all or parts of fancier operators; e.g., +@samp{.} represents what we call the match-any-character operator +(which, no surprise, matches (almost) any character); we call these +characters @dfn{special}. Two different things determine what +characters represent what operators: + +@enumerate +@item +the regular expression syntax your program has told the Regex library to +recognize, and + +@item +the context of the character in the regular expression. +@end enumerate + +In the following sections, we describe these things in more detail. + +@menu +* Syntax Bits:: +* Predefined Syntaxes:: +* Collating Elements vs. Characters:: +* The Backslash Character:: +@end menu + + +@node Syntax Bits, Predefined Syntaxes, , Regular Expression Syntax +@section Syntax Bits + +@cindex syntax bits + +In any particular syntax for regular expressions, some characters are +always special, others are sometimes special, and others are never +special. The particular syntax that Regex recognizes for a given +regular expression depends on the value in the @code{syntax} field of +the pattern buffer of that regular expression. + +You get a pattern buffer by compiling a regular expression. @xref{GNU +Pattern Buffers}, and @ref{POSIX Pattern Buffers}, for more information +on pattern buffers. @xref{GNU Regular Expression Compiling}, @ref{POSIX +Regular Expression Compiling}, and @ref{BSD Regular Expression +Compiling}, for more information on compiling. + +Regex considers the value of the @code{syntax} field to be a collection +of bits; we refer to these bits as @dfn{syntax bits}. In most cases, +they affect what characters represent what operators. We describe the +meanings of the operators to which we refer in @ref{Common Operators}, +@ref{GNU Operators}, and @ref{GNU Emacs Operators}. 
+ +For reference, here is the complete list of syntax bits, in alphabetical +order: + +@table @code + +@cnindex RE_BACKSLASH_ESCAPE_IN_LISTS +@item RE_BACKSLASH_ESCAPE_IN_LISTS +If this bit is set, then @samp{\} inside a list (@pxref{List Operators}) +quotes (makes ordinary, if it's special) the following character; if +this bit isn't set, then @samp{\} is an ordinary character inside lists. +(@xref{The Backslash Character}, for what `\' does outside of lists.) + +@cnindex RE_BK_PLUS_QM +@item RE_BK_PLUS_QM +If this bit is set, then @samp{\+} represents the match-one-or-more +operator and @samp{\?} represents the match-zero-or-one operator; if +this bit isn't set, then @samp{+} represents the match-one-or-more +operator and @samp{?} represents the match-zero-or-one operator. This +bit is irrelevant if @code{RE_LIMITED_OPS} is set. + +@cnindex RE_CHAR_CLASSES +@item RE_CHAR_CLASSES +If this bit is set, then you can use character classes in lists; if this +bit isn't set, then you can't. + +@cnindex RE_CONTEXT_INDEP_ANCHORS +@item RE_CONTEXT_INDEP_ANCHORS +If this bit is set, then @samp{^} and @samp{$} are special anywhere outside +a list; if this bit isn't set, then these characters are special only in +certain contexts. @xref{Match-beginning-of-line Operator}, and +@ref{Match-end-of-line Operator}. + +@cnindex RE_CONTEXT_INDEP_OPS +@item RE_CONTEXT_INDEP_OPS +If this bit is set, then certain characters are special anywhere outside +a list; if this bit isn't set, then those characters are special only in +some contexts and are ordinary elsewhere. Specifically, if this bit +isn't set then @samp{*}, and (if the syntax bit @code{RE_LIMITED_OPS} +isn't set) @samp{+} and @samp{?} (or @samp{\+} and @samp{\?}, depending +on the syntax bit @code{RE_BK_PLUS_QM}) represent repetition operators +only if they're not first in a regular expression or just after an +open-group or alternation operator. 
The same holds for @samp{@{} (or +@samp{\@{}, depending on the syntax bit @code{RE_NO_BK_BRACES}) if +it is the beginning of a valid interval and the syntax bit +@code{RE_INTERVALS} is set. + +@cnindex RE_CONTEXT_INVALID_OPS +@item RE_CONTEXT_INVALID_OPS +If this bit is set, then repetition and alternation operators can't be +in certain positions within a regular expression. Specifically, the +regular expression is invalid if it has: + +@itemize @bullet + +@item +a repetition operator first in the regular expression or just after a +match-beginning-of-line, open-group, or alternation operator; or + +@item +an alternation operator first or last in the regular expression, just +before a match-end-of-line operator, or just after an alternation or +open-group operator. + +@end itemize + +If this bit isn't set, then you can put the characters representing the +repetition and alternation operators anywhere in a regular expression. +Whether or not they will in fact be operators in certain positions +depends on other syntax bits. + +@cnindex RE_DOT_NEWLINE +@item RE_DOT_NEWLINE +If this bit is set, then the match-any-character operator matches +a newline; if this bit isn't set, then it doesn't. + +@cnindex RE_DOT_NOT_NULL +@item RE_DOT_NOT_NULL +If this bit is set, then the match-any-character operator doesn't match +a null character; if this bit isn't set, then it does. + +@cnindex RE_INTERVALS +@item RE_INTERVALS +If this bit is set, then Regex recognizes interval operators; if this bit +isn't set, then it doesn't. + +@cnindex RE_LIMITED_OPS +@item RE_LIMITED_OPS +If this bit is set, then Regex doesn't recognize the match-one-or-more, +match-zero-or-one or alternation operators; if this bit isn't set, then +it does. + +@cnindex RE_NEWLINE_ALT +@item RE_NEWLINE_ALT +If this bit is set, then newline represents the alternation operator; if +this bit isn't set, then newline is ordinary. 
+ +@cnindex RE_NO_BK_BRACES +@item RE_NO_BK_BRACES +If this bit is set, then @samp{@{} represents the open-interval operator +and @samp{@}} represents the close-interval operator; if this bit isn't +set, then @samp{\@{} represents the open-interval operator and +@samp{\@}} represents the close-interval operator. This bit is relevant +only if @code{RE_INTERVALS} is set. + +@cnindex RE_NO_BK_PARENS +@item RE_NO_BK_PARENS +If this bit is set, then @samp{(} represents the open-group operator and +@samp{)} represents the close-group operator; if this bit isn't set, then +@samp{\(} represents the open-group operator and @samp{\)} represents +the close-group operator. + +@cnindex RE_NO_BK_REFS +@item RE_NO_BK_REFS +If this bit is set, then Regex doesn't recognize @samp{\}@var{digit} as +the back reference operator; if this bit isn't set, then it does. + +@cnindex RE_NO_BK_VBAR +@item RE_NO_BK_VBAR +If this bit is set, then @samp{|} represents the alternation operator; +if this bit isn't set, then @samp{\|} represents the alternation +operator. This bit is irrelevant if @code{RE_LIMITED_OPS} is set. + +@cnindex RE_NO_EMPTY_RANGES +@item RE_NO_EMPTY_RANGES +If this bit is set, then a regular expression with a range whose ending +point collates lower than its starting point is invalid; if this bit +isn't set, then Regex considers such a range to be empty. + +@cnindex RE_UNMATCHED_RIGHT_PAREN_ORD +@item RE_UNMATCHED_RIGHT_PAREN_ORD +If this bit is set and the regular expression has no matching open-group +operator, then Regex considers what would otherwise be a close-group +operator (based on how @code{RE_NO_BK_PARENS} is set) to match @samp{)}. + +@end table + + +@node Predefined Syntaxes, Collating Elements vs. 
Characters, Syntax Bits, Regular Expression Syntax +@section Predefined Syntaxes + +If you're programming with Regex, you can set a pattern buffer's +(@pxref{GNU Pattern Buffers}, and @ref{POSIX Pattern Buffers}) +@code{syntax} field either to an arbitrary combination of syntax bits +(@pxref{Syntax Bits}) or else to the configurations defined by Regex. +These configurations define the syntaxes used by certain +programs---@sc{gnu} Emacs, +@cindex Emacs +@sc{posix} Awk, +@cindex POSIX Awk +traditional Awk, +@cindex Awk +Grep, +@cindex Grep +@cindex Egrep +Egrep---in addition to syntaxes for @sc{posix} basic and extended +regular expressions. + +The predefined syntaxes--taken directly from @file{regex.h}---are: + +@example +#define RE_SYNTAX_EMACS 0 + +#define RE_SYNTAX_AWK \ + (RE_BACKSLASH_ESCAPE_IN_LISTS | RE_DOT_NOT_NULL \ + | RE_NO_BK_PARENS | RE_NO_BK_REFS \ + | RE_NO_BK_VBAR | RE_NO_EMPTY_RANGES \ + | RE_UNMATCHED_RIGHT_PAREN_ORD) + +#define RE_SYNTAX_POSIX_AWK \ + (RE_SYNTAX_POSIX_EXTENDED | RE_BACKSLASH_ESCAPE_IN_LISTS) + +#define RE_SYNTAX_GREP \ + (RE_BK_PLUS_QM | RE_CHAR_CLASSES \ + | RE_HAT_LISTS_NOT_NEWLINE | RE_INTERVALS \ + | RE_NEWLINE_ALT) + +#define RE_SYNTAX_EGREP \ + (RE_CHAR_CLASSES | RE_CONTEXT_INDEP_ANCHORS \ + | RE_CONTEXT_INDEP_OPS | RE_HAT_LISTS_NOT_NEWLINE \ + | RE_NEWLINE_ALT | RE_NO_BK_PARENS \ + | RE_NO_BK_VBAR) + +#define RE_SYNTAX_POSIX_EGREP \ + (RE_SYNTAX_EGREP | RE_INTERVALS | RE_NO_BK_BRACES) + +/* P1003.2/D11.2, section 4.20.7.1, lines 5078ff. */ +#define RE_SYNTAX_ED RE_SYNTAX_POSIX_BASIC + +#define RE_SYNTAX_SED RE_SYNTAX_POSIX_BASIC + +/* Syntax bits common to both basic and extended POSIX regex syntax. */ +#define _RE_SYNTAX_POSIX_COMMON \ + (RE_CHAR_CLASSES | RE_DOT_NEWLINE | RE_DOT_NOT_NULL \ + | RE_INTERVALS | RE_NO_EMPTY_RANGES) + +#define RE_SYNTAX_POSIX_BASIC \ + (_RE_SYNTAX_POSIX_COMMON | RE_BK_PLUS_QM) + +/* Differs from ..._POSIX_BASIC only in that RE_BK_PLUS_QM becomes + RE_LIMITED_OPS, i.e., \? 
\+ \| are not recognized. Actually, this + isn't minimal, since other operators, such as \`, aren't disabled. */ +#define RE_SYNTAX_POSIX_MINIMAL_BASIC \ + (_RE_SYNTAX_POSIX_COMMON | RE_LIMITED_OPS) + +#define RE_SYNTAX_POSIX_EXTENDED \ + (_RE_SYNTAX_POSIX_COMMON | RE_CONTEXT_INDEP_ANCHORS \ + | RE_CONTEXT_INDEP_OPS | RE_NO_BK_BRACES \ + | RE_NO_BK_PARENS | RE_NO_BK_VBAR \ + | RE_UNMATCHED_RIGHT_PAREN_ORD) + +/* Differs from ..._POSIX_EXTENDED in that RE_CONTEXT_INVALID_OPS + replaces RE_CONTEXT_INDEP_OPS and RE_NO_BK_REFS is added. */ +#define RE_SYNTAX_POSIX_MINIMAL_EXTENDED \ + (_RE_SYNTAX_POSIX_COMMON | RE_CONTEXT_INDEP_ANCHORS \ + | RE_CONTEXT_INVALID_OPS | RE_NO_BK_BRACES \ + | RE_NO_BK_PARENS | RE_NO_BK_REFS \ + | RE_NO_BK_VBAR | RE_UNMATCHED_RIGHT_PAREN_ORD) +@end example + +@node Collating Elements vs. Characters, The Backslash Character, Predefined Syntaxes, Regular Expression Syntax +@section Collating Elements vs.@: Characters + +@sc{posix} generalizes the notion of a character to that of a +collating element. It defines a @dfn{collating element} to be ``a +sequence of one or more bytes defined in the current collating sequence +as a unit of collation.'' + +This generalizes the notion of a character in +two ways. First, a single character can map into two or more collating +elements. For example, the German +@tex +`\ss' +@end tex +@ifinfo +``es-zet'' +@end ifinfo +collates as the collating element @samp{s} followed by another collating +element @samp{s}. Second, two or more characters can map into one +collating element. For example, the Spanish @samp{ll} collates after +@samp{l} and before @samp{m}. + +Since @sc{posix}'s ``collating element'' preserves the essential idea of +a ``character,'' we use the latter, more familiar, term in this document. + +@node The Backslash Character, , Collating Elements vs. 
Characters, Regular Expression Syntax +@section The Backslash Character + +@cindex \ +The @samp{\} character has one of four different meanings, depending on +the context in which you use it and what syntax bits are set +(@pxref{Syntax Bits}). It can: 1) stand for itself, 2) quote the next +character, 3) introduce an operator, or 4) do nothing. + +@enumerate +@item +It stands for itself inside a list +(@pxref{List Operators}) if the syntax bit +@code{RE_BACKSLASH_ESCAPE_IN_LISTS} is not set. For example, @samp{[\]} +would match @samp{\}. + +@item +It quotes (makes ordinary, if it's special) the next character when you +use it either: + +@itemize @bullet +@item +outside a list,@footnote{Sometimes +you don't have to explicitly quote special characters to make +them ordinary. For instance, most characters lose any special meaning +inside a list (@pxref{List Operators}). In addition, if the syntax bits +@code{RE_CONTEXT_INVALID_OPS} and @code{RE_CONTEXT_INDEP_OPS} +aren't set, then (for historical reasons) the matcher considers special +characters ordinary if they are in contexts where the operations they +represent make no sense; for example, then the match-zero-or-more +operator (represented by @samp{*}) matches itself in the regular +expression @samp{*foo} because there is no preceding expression on which +it can operate. It is poor practice, however, to depend on this +behavior; if you want a special character to be ordinary outside a list, +it's better to always quote it, regardless.} or + +@item +inside a list and the syntax bit @code{RE_BACKSLASH_ESCAPE_IN_LISTS} is set. + +@end itemize + +@item +It introduces an operator when followed by certain ordinary +characters---sometimes only when certain syntax bits are set. See the +cases @code{RE_BK_PLUS_QM}, @code{RE_NO_BK_BRACES}, @code{RE_NO_BK_VBAR}, +@code{RE_NO_BK_PARENS}, @code{RE_NO_BK_REFS} in @ref{Syntax Bits}. 
Also: + +@itemize @bullet +@item +@samp{\b} represents the match-word-boundary operator +(@pxref{Match-word-boundary Operator}). + +@item +@samp{\B} represents the match-within-word operator +(@pxref{Match-within-word Operator}). + +@item +@samp{\<} represents the match-beginning-of-word operator @* +(@pxref{Match-beginning-of-word Operator}). + +@item +@samp{\>} represents the match-end-of-word operator +(@pxref{Match-end-of-word Operator}). + +@item +@samp{\w} represents the match-word-constituent operator +(@pxref{Match-word-constituent Operator}). + +@item +@samp{\W} represents the match-non-word-constituent operator +(@pxref{Match-non-word-constituent Operator}). + +@item +@samp{\`} represents the match-beginning-of-buffer +operator and @samp{\'} represents the match-end-of-buffer operator +(@pxref{Buffer Operators}). + +@item +If Regex was compiled with the C preprocessor symbol @code{emacs} +defined, then @samp{\s@var{class}} represents the match-syntactic-class +operator and @samp{\S@var{class}} represents the +match-not-syntactic-class operator (@pxref{Syntactic Class Operators}). + +@end itemize + +@item +In all other cases, Regex ignores @samp{\}. For example, +@samp{\n} matches @samp{n}. + +@end enumerate + +@node Common Operators, GNU Operators, Regular Expression Syntax, Top +@chapter Common Operators + +You compose regular expressions from operators. In the following +sections, we describe the regular expression operators specified by +@sc{posix}; @sc{gnu} also uses these. Most operators have more than one +representation as characters. @xref{Regular Expression Syntax}, for +what characters represent what operators under what circumstances. + +For most operators that can be represented in two ways, one +representation is a single character and the other is that character +preceded by @samp{\}. For example, either @samp{(} or @samp{\(} +represents the open-group operator. 
Which one does depends on the +setting of a syntax bit, in this case @code{RE_NO_BK_PARENS}. Why is +this so? Historical reasons dictate some of the varying +representations, while @sc{posix} dictates others. + +Finally, almost all characters lose any special meaning inside a list +(@pxref{List Operators}). + +@menu +* Match-self Operator:: Ordinary characters. +* Match-any-character Operator:: . +* Concatenation Operator:: Juxtaposition. +* Repetition Operators:: * + ? @{@} +* Alternation Operator:: | +* List Operators:: [...] [^...] +* Grouping Operators:: (...) +* Back-reference Operator:: \digit +* Anchoring Operators:: ^ $ +@end menu + +@node Match-self Operator, Match-any-character Operator, , Common Operators +@section The Match-self Operator (@var{ordinary character}) + +This operator matches the character itself. All ordinary characters +(@pxref{Regular Expression Syntax}) represent this operator. For +example, @samp{f} is always an ordinary character, so the regular +expression @samp{f} matches only the string @samp{f}. In +particular, it does @emph{not} match the string @samp{ff}. + +@node Match-any-character Operator, Concatenation Operator, Match-self Operator, Common Operators +@section The Match-any-character Operator (@code{.}) + +@cindex @samp{.} + +This operator matches any single printing or nonprinting character +except it won't match a: + +@table @asis +@item newline +if the syntax bit @code{RE_DOT_NEWLINE} isn't set. + +@item null +if the syntax bit @code{RE_DOT_NOT_NULL} is set. + +@end table + +The @samp{.} (period) character represents this operator. For example, +@samp{a.b} matches any three-character string beginning with @samp{a} +and ending with @samp{b}. + +@node Concatenation Operator, Repetition Operators, Match-any-character Operator, Common Operators +@section The Concatenation Operator + +This operator concatenates two regular expressions @var{a} and @var{b}. 
+No character represents this operator; you simply put @var{b} after +@var{a}. The result is a regular expression that will match a string if +@var{a} matches its first part and @var{b} matches the rest. For +example, @samp{xy} (two match-self operators) matches @samp{xy}. + +@node Repetition Operators, Alternation Operator, Concatenation Operator, Common Operators +@section Repetition Operators + +Repetition operators repeat the preceding regular expression a specified +number of times. + +@menu +* Match-zero-or-more Operator:: * +* Match-one-or-more Operator:: + +* Match-zero-or-one Operator:: ? +* Interval Operators:: @{@} +@end menu + +@node Match-zero-or-more Operator, Match-one-or-more Operator, , Repetition Operators +@subsection The Match-zero-or-more Operator (@code{*}) + +@cindex @samp{*} + +This operator repeats the smallest possible preceding regular expression +as many times as necessary (including zero) to match the pattern. +@samp{*} represents this operator. For example, @samp{o*} +matches any string made up of zero or more @samp{o}s. Since this +operator operates on the smallest preceding regular expression, +@samp{fo*} has a repeating @samp{o}, not a repeating @samp{fo}. So, +@samp{fo*} matches @samp{f}, @samp{fo}, @samp{foo}, and so on. + +Since the match-zero-or-more operator is a suffix operator, it may be +useless as such when no regular expression precedes it. This is the +case when it: + +@itemize @bullet +@item +is first in a regular expression, or + +@item +follows a match-beginning-of-line, open-group, or alternation +operator. + +@end itemize + +@noindent +Three different things can happen in these cases: + +@enumerate +@item +If the syntax bit @code{RE_CONTEXT_INVALID_OPS} is set, then the +regular expression is invalid. + +@item +If @code{RE_CONTEXT_INVALID_OPS} isn't set, but +@code{RE_CONTEXT_INDEP_OPS} is, then @samp{*} represents the +match-zero-or-more operator (which then operates on the empty string). 
+ +@item +Otherwise, @samp{*} is ordinary. + +@end enumerate + +@cindex backtracking +The matcher processes a match-zero-or-more operator by first matching as +many repetitions of the smallest preceding regular expression as it can. +Then it continues to match the rest of the pattern. + +If it can't match the rest of the pattern, it backtracks (as many times +as necessary), each time discarding one of the matches until it can +either match the entire pattern or be certain that it cannot get a +match. For example, when matching @samp{ca*ar} against @samp{caaar}, +the matcher first matches all three @samp{a}s of the string with the +@samp{a*} of the regular expression. However, it cannot then match the +final @samp{ar} of the regular expression against the final @samp{r} of +the string. So it backtracks, discarding the match of the last @samp{a} +in the string. It can then match the remaining @samp{ar}. + + +@node Match-one-or-more Operator, Match-zero-or-one Operator, Match-zero-or-more Operator, Repetition Operators +@subsection The Match-one-or-more Operator (@code{+} or @code{\+}) + +@cindex @samp{+} + +If the syntax bit @code{RE_LIMITED_OPS} is set, then Regex doesn't recognize +this operator. Otherwise, if the syntax bit @code{RE_BK_PLUS_QM} isn't +set, then @samp{+} represents this operator; if it is, then @samp{\+} +does. + +This operator is similar to the match-zero-or-more operator except that +it repeats the preceding regular expression at least once; +@pxref{Match-zero-or-more Operator}, for what it operates on, how some +syntax bits affect it, and how Regex backtracks to match it. + +For example, supposing that @samp{+} represents the match-one-or-more +operator; then @samp{ca+r} matches, e.g., @samp{car} and +@samp{caaaar}, but not @samp{cr}. 
+ +@node Match-zero-or-one Operator, Interval Operators, Match-one-or-more Operator, Repetition Operators +@subsection The Match-zero-or-one Operator (@code{?} or @code{\?}) +@cindex @samp{?} + +If the syntax bit @code{RE_LIMITED_OPS} is set, then Regex doesn't +recognize this operator. Otherwise, if the syntax bit +@code{RE_BK_PLUS_QM} isn't set, then @samp{?} represents this operator; +if it is, then @samp{\?} does. + +This operator is similar to the match-zero-or-more operator except that +it repeats the preceding regular expression once or not at all; +@pxref{Match-zero-or-more Operator}, to see what it operates on, how +some syntax bits affect it, and how Regex backtracks to match it. + +For example, supposing that @samp{?} represents the match-zero-or-one +operator; then @samp{ca?r} matches both @samp{car} and @samp{cr}, but +nothing else. + +@node Interval Operators, , Match-zero-or-one Operator, Repetition Operators +@subsection Interval Operators (@code{@{} @dots{} @code{@}} or @code{\@{} @dots{} @code{\@}}) + +@cindex interval expression +@cindex @samp{@{} +@cindex @samp{@}} +@cindex @samp{\@{} +@cindex @samp{\@}} + +If the syntax bit @code{RE_INTERVALS} is set, then Regex recognizes +@dfn{interval expressions}. They repeat the smallest possible preceding +regular expression a specified number of times. + +If the syntax bit @code{RE_NO_BK_BRACES} is set, @samp{@{} represents +the @dfn{open-interval operator} and @samp{@}} represents the +@dfn{close-interval operator} ; otherwise, @samp{\@{} and @samp{\@}} do. + +Specifically, supposing that @samp{@{} and @samp{@}} represent the +open-interval and close-interval operators; then: + +@table @code +@item @{@var{count}@} +matches exactly @var{count} occurrences of the preceding regular +expression. + +@item @{@var{min,}@} +matches @var{min} or more occurrences of the preceding regular +expression. 
+ +@item @{@var{min, max}@} +matches at least @var{min} but no more than @var{max} occurrences of +the preceding regular expression. + +@end table + +The interval expression (but not necessarily the regular expression that +contains it) is invalid if: + +@itemize @bullet +@item +@var{min} is greater than @var{max}, or + +@item +any of @var{count}, @var{min}, or @var{max} are outside the range +zero to @code{RE_DUP_MAX} (which symbol @file{regex.h} +defines). + +@end itemize + +If the interval expression is invalid and the syntax bit +@code{RE_NO_BK_BRACES} is set, then Regex considers all the +characters in the would-be interval to be ordinary. If that bit +isn't set, then the regular expression is invalid. + +If the interval expression is valid but there is no preceding regular +expression on which to operate, then if the syntax bit +@code{RE_CONTEXT_INVALID_OPS} is set, the regular expression is invalid. +If that bit isn't set, then Regex considers all the characters---other +than backslashes, which it ignores---in the would-be interval to be +ordinary. + + +@node Alternation Operator, List Operators, Repetition Operators, Common Operators +@section The Alternation Operator (@code{|} or @code{\|}) + +@kindex | +@kindex \| +@cindex alternation operator +@cindex or operator + +If the syntax bit @code{RE_LIMITED_OPS} is set, then Regex doesn't +recognize this operator. Otherwise, if the syntax bit +@code{RE_NO_BK_VBAR} is set, then @samp{|} represents this operator; +otherwise, @samp{\|} does. + +Alternatives match one of a choice of regular expressions: +if you put the character(s) representing the alternation operator between +any two regular expressions @var{a} and @var{b}, the result matches +the union of the strings that @var{a} and @var{b} match. For +example, supposing that @samp{|} is the alternation operator, then +@samp{foo|bar|quux} would match any of @samp{foo}, @samp{bar} or +@samp{quux}. 
+ +@ignore +@c Nobody needs to disallow empty alternatives any more. +If the syntax bit @code{RE_NO_EMPTY_ALTS} is set, then if either of the regular +expressions @var{a} or @var{b} is empty, the +regular expression is invalid. More precisely, if this syntax bit is +set, then the alternation operator can't: + +@itemize @bullet +@item +be first or last in a regular expression; + +@item +follow either another alternation operator or an open-group operator +(@pxref{Grouping Operators}); or + +@item +precede a close-group operator. + +@end itemize + +@noindent +For example, supposing @samp{(} and @samp{)} represent the open and +close-group operators, then @samp{|foo}, @samp{foo|}, @samp{foo||bar}, +@samp{foo(|bar)}, and @samp{(foo|)bar} would all be invalid. +@end ignore + +The alternation operator operates on the @emph{largest} possible +surrounding regular expressions. (Put another way, it has the lowest +precedence of any regular expression operator.) +Thus, the only way you can +delimit its arguments is to use grouping. For example, if @samp{(} and +@samp{)} are the open and close-group operators, then @samp{fo(o|b)ar} +would match either @samp{fooar} or @samp{fobar}. (@samp{foo|bar} would +match @samp{foo} or @samp{bar}.) + +@cindex backtracking +The matcher usually tries all combinations of alternatives so as to +match the longest possible string. For example, when matching +@samp{(fooq|foo)*(qbarquux|bar)} against @samp{fooqbarquux}, it cannot +take, say, the first (``depth-first'') combination it could match, since +then it would be content to match just @samp{fooqbar}. 
+ +@comment xx something about leftmost-longest + + +@node List Operators, Grouping Operators, Alternation Operator, Common Operators +@section List Operators (@code{[} @dots{} @code{]} and @code{[^} @dots{} @code{]}) + +@cindex matching list +@cindex @samp{[} +@cindex @samp{]} +@cindex @samp{^} +@cindex @samp{-} +@cindex @samp{\} +@cindex @samp{[^} +@cindex nonmatching list +@cindex matching newline +@cindex bracket expression + +@dfn{Lists}, also called @dfn{bracket expressions}, are a set of one or +more items. An @dfn{item} is a character, +@ignore +(These get added when they get implemented.) +a collating symbol, an equivalence class expression, +@end ignore +a character class expression, or a range expression. The syntax bits +affect which kinds of items you can put in a list. We explain the last +two items in subsections below. Empty lists are invalid. + +A @dfn{matching list} matches a single character represented by one of +the list items. You form a matching list by enclosing one or more items +within an @dfn{open-matching-list operator} (represented by @samp{[}) +and a @dfn{close-list operator} (represented by @samp{]}). + +For example, @samp{[ab]} matches either @samp{a} or @samp{b}. +@samp{[ad]*} matches the empty string and any string composed of just +@samp{a}s and @samp{d}s in any order. Regex considers invalid a regular +expression with a @samp{[} but no matching +@samp{]}. + +@dfn{Nonmatching lists} are similar to matching lists except that they +match a single character @emph{not} represented by one of the list +items. You use an @dfn{open-nonmatching-list operator} (represented by +@samp{[^}@footnote{Regex therefore doesn't consider the @samp{^} to be +the first character in the list. If you put a @samp{^} character first +in (what you think is) a matching list, you'll turn it into a +nonmatching list.}) instead of an open-matching-list operator to start a +nonmatching list. 
+ +For example, @samp{[^ab]} matches any character except @samp{a} or +@samp{b}. + +If the @code{posix_newline} field in the pattern buffer (@pxref{GNU +Pattern Buffers}) is set, then nonmatching lists do not match a newline. + +Most characters lose any special meaning inside a list. The special +characters inside a list follow. + +@table @samp +@item ] +ends the list if it's not the first list item. So, if you want to make +the @samp{]} character a list item, you must put it first. + +@item \ +quotes the next character if the syntax bit @code{RE_BACKSLASH_ESCAPE_IN_LISTS} is +set. + +@ignore +Put these in if they get implemented. + +@item [. +represents the open-collating-symbol operator (@pxref{Collating Symbol +Operators}). + +@item .] +represents the close-collating-symbol operator. + +@item [= +represents the open-equivalence-class operator (@pxref{Equivalence Class +Operators}). + +@item =] +represents the close-equivalence-class operator. + +@end ignore + +@item [: +represents the open-character-class operator (@pxref{Character Class +Operators}) if the syntax bit @code{RE_CHAR_CLASSES} is set and what +follows is a valid character class expression. + +@item :] +represents the close-character-class operator if the syntax bit +@code{RE_CHAR_CLASSES} is set and what precedes it is an +open-character-class operator followed by a valid character class name. + +@item - +represents the range operator (@pxref{Range Operator}) if it's +not first or last in a list or the ending point of a range. + +@end table + +@noindent +All other characters are ordinary. For example, @samp{[.*]} matches +@samp{.} and @samp{*}. + +@menu +* Character Class Operators:: [:class:] +* Range Operator:: start-end +@end menu + +@ignore +(If collating symbols and equivalence class expressions get implemented, +then add this.)
+ +node Collating Symbol Operators +subsubsection Collating Symbol Operators (@code{[.} @dots{} @code{.]}) + +If the syntax bit @code{XX} is set, then you can represent +collating symbols inside lists. You form a @dfn{collating symbol} by +putting a collating element between an @dfn{open-collating-symbol +operator} and a @dfn{close-collating-symbol operator}. @samp{[.} +represents the open-collating-symbol operator and @samp{.]} represents +the close-collating-symbol operator. For example, if @samp{ll} is a +collating element, then @samp{[[.ll.]]} would match @samp{ll}. + +node Equivalence Class Operators +subsubsection Equivalence Class Operators (@code{[=} @dots{} @code{=]}) +@cindex equivalence class expression in regex +@cindex @samp{[=} in regex +@cindex @samp{=]} in regex + +If the syntax bit @code{XX} is set, then Regex recognizes equivalence class +expressions inside lists. An @dfn{equivalence class expression} is a set +of collating elements which all belong to the same equivalence class. +You form an equivalence class expression by putting a collating +element between an @dfn{open-equivalence-class operator} and a +@dfn{close-equivalence-class operator}. @samp{[=} represents the +open-equivalence-class operator and @samp{=]} represents the +close-equivalence-class operator. For example, if @samp{a} and @samp{A} +were an equivalence class, then both @samp{[[=a=]]} and @samp{[[=A=]]} +would match both @samp{a} and @samp{A}. If the collating element in an +equivalence class expression isn't part of an equivalence class, then +the matcher considers the equivalence class expression to be a collating +symbol.
+ +@end ignore + +@node Character Class Operators, Range Operator, , List Operators +@subsection Character Class Operators (@code{[:} @dots{} @code{:]}) + +@cindex character classes +@cindex @samp{[:} in regex +@cindex @samp{:]} in regex + +If the syntax bit @code{RE_CHAR_CLASSES} is set, then Regex +recognizes character class expressions inside lists. A @dfn{character +class expression} matches one character from a given class. You form a +character class expression by putting a character class name between an +@dfn{open-character-class operator} (represented by @samp{[:}) and a +@dfn{close-character-class operator} (represented by @samp{:]}). The +character class names and their meanings are: + +@table @code + +@item alnum +letters and digits + +@item alpha +letters + +@item blank +system-dependent; for @sc{gnu}, a space or tab + +@item cntrl +control characters (in the @sc{ascii} encoding, code 0177 and codes +less than 040) + +@item digit +digits + +@item graph +same as @code{print} except omits space + +@item lower +lowercase letters + +@item print +printable characters (in the @sc{ascii} encoding, space through +tilde---codes 040 through 0176) + +@item punct +neither control nor alphanumeric characters + +@item space +space, horizontal tab, carriage return, newline, vertical tab, and form feed + +@item upper +uppercase letters + +@item xdigit +hexadecimal digits: @code{0}--@code{9}, @code{a}--@code{f}, @code{A}--@code{F} + +@end table + +@noindent +These correspond to the definitions in the C library's @file{<ctype.h>} +facility. For example, @samp{[:alpha:]} corresponds to the standard +facility @code{isalpha}. Regex recognizes character class expressions +only inside of lists; so @samp{[[:alpha:]]} matches any letter, but +@samp{[:alpha:]} outside of a bracket expression and not followed by a +repetition operator matches just itself.
+ +@node Range Operator, , Character Class Operators, List Operators +@subsection The Range Operator (@code{-}) + +Regex recognizes @dfn{range expressions} inside a list. They represent +those characters +that fall between two elements in the current collating sequence. You +form a range expression by putting a @dfn{range operator} between two +@ignore +(If these get implemented, then substitute this for ``characters.'') +of any of the following: characters, collating elements, collating symbols, +and equivalence class expressions. The starting point of the range and +the ending point of the range don't have to be the same kind of item, +e.g., the starting point could be a collating element and the ending +point could be an equivalence class expression. If a range's ending +point is an equivalence class, then all the collating elements in that +class will be in the range. +@end ignore +characters.@footnote{You can't use a character class for the starting +or ending point of a range, since a character class is not a single +character.} @samp{-} represents the range operator. For example, +@samp{a-f} within a list represents all the characters from @samp{a} +through @samp{f} +inclusively. + +If the syntax bit @code{RE_NO_EMPTY_RANGES} is set, then if the range's +ending point collates less than its starting point, the range (and the +regular expression containing it) is invalid. For example, the regular +expression @samp{[z-a]} would be invalid. If this bit isn't set, then +Regex considers such a range to be empty. + +Since @samp{-} represents the range operator, if you want to make a +@samp{-} character itself +a list item, you must do one of the following: + +@itemize @bullet +@item +Put the @samp{-} either first or last in the list. + +@item +Include a range whose starting point collates strictly lower than +@samp{-} and whose ending point collates equal or higher. 
Unless a +range is the first item in a list, a @samp{-} can't be its starting +point, but @emph{can} be its ending point. That is because Regex +considers @samp{-} to be the range operator unless it is preceded by +another @samp{-}. For example, in the @sc{ascii} encoding, @samp{)}, +@samp{*}, @samp{+}, @samp{,}, @samp{-}, @samp{.}, and @samp{/} are +contiguous characters in the collating sequence. You might think that +@samp{[)-+--/]} has two ranges: @samp{)-+} and @samp{--/}. Rather, it +has the ranges @samp{)-+} and @samp{+--}, plus the character @samp{/}, so +it matches, e.g., @samp{,}, not @samp{.}. + +@item +Put a range whose starting point is @samp{-} first in the list. + +@end itemize + +For example, @samp{[-a-z]} matches a lowercase letter or a hyphen (in +English, in @sc{ascii}). + + +@node Grouping Operators, Back-reference Operator, List Operators, Common Operators +@section Grouping Operators (@code{(} @dots{} @code{)} or @code{\(} @dots{} @code{\)}) + +@kindex ( +@kindex ) +@kindex \( +@kindex \) +@cindex grouping +@cindex subexpressions +@cindex parenthesizing + +A @dfn{group}, also known as a @dfn{subexpression}, consists of an +@dfn{open-group operator}, any number of other operators, and a +@dfn{close-group operator}. Regex treats this sequence as a unit, just +as mathematics and programming languages treat a parenthesized +expression as a unit. + +Therefore, using @dfn{groups}, you can: + +@itemize @bullet +@item +delimit the argument(s) to an alternation operator (@pxref{Alternation +Operator}) or a repetition operator (@pxref{Repetition +Operators}). + +@item +keep track of the indices of the substring that matched a given group. +@xref{Using Registers}, for a precise explanation. +This lets you: + +@itemize @bullet +@item +use the back-reference operator (@pxref{Back-reference Operator}). + +@item +use registers (@pxref{Using Registers}). 
+ +@end itemize + +@end itemize + +If the syntax bit @code{RE_NO_BK_PARENS} is set, then @samp{(} represents +the open-group operator and @samp{)} represents the +close-group operator; otherwise, @samp{\(} and @samp{\)} do. + +If the syntax bit @code{RE_UNMATCHED_RIGHT_PAREN_ORD} is set and a +close-group operator has no matching open-group operator, then Regex +considers it to match @samp{)}. + + +@node Back-reference Operator, Anchoring Operators, Grouping Operators, Common Operators +@section The Back-reference Operator (@dfn{\}@var{digit}) + +@cindex back references + +If the syntax bit @code{RE_NO_BK_REF} isn't set, then Regex recognizes +back references. A back reference matches a specified preceding group. +The back reference operator is represented by @samp{\@var{digit}} +anywhere after the end of a regular expression's @w{@var{digit}-th} +group (@pxref{Grouping Operators}). + +@var{digit} must be between @samp{1} and @samp{9}. The matcher assigns +numbers 1 through 9 to the first nine groups it encounters. By using +one of @samp{\1} through @samp{\9} after the corresponding group's +close-group operator, you can match a substring identical to the +one that the group does. + +Back references match according to the following (in all examples below, +@samp{(} represents the open-group, @samp{)} the close-group, @samp{@{} +the open-interval and @samp{@}} the close-interval operator): + +@itemize @bullet +@item +If the group matches a substring, the back reference matches an +identical substring. For example, @samp{(a)\1} matches @samp{aa} and +@samp{(bana)na\1bo\1} matches @samp{bananabanabobana}. Likewise, +@samp{(.*)\1} matches any (newline-free if the syntax bit +@code{RE_DOT_NEWLINE} isn't set) string that is composed of two +identical halves; the @samp{(.*)} matches the first half and the +@samp{\1} matches the second half. 
+ +@item +If the group matches more than once (as it might if followed +by, e.g., a repetition operator), then the back reference matches the +substring the group @emph{last} matched. For example, +@samp{((a*)b)*\1\2} matches @samp{aabababa}; first @w{group 1} (the +outer one) matches @samp{aab} and @w{group 2} (the inner one) matches +@samp{aa}. Then @w{group 1} matches @samp{ab} and @w{group 2} matches +@samp{a}. So, @samp{\1} matches @samp{ab} and @samp{\2} matches +@samp{a}. + +@item +If the group doesn't participate in a match, i.e., it is part of an +alternative not taken or a repetition operator allows zero repetitions +of it, then the back reference makes the whole match fail. For example, +@samp{(one()|two())-and-(three\2|four\3)} matches @samp{one-and-three} +and @samp{two-and-four}, but not @samp{one-and-four} or +@samp{two-and-three}. For example, if the pattern matches +@samp{one-and-}, then its @w{group 2} matches the empty string and its +@w{group 3} doesn't participate in the match. So, if it then matches +@samp{four}, then when it tries to back reference @w{group 3}---which it +will attempt to do because @samp{\3} follows the @samp{four}---the match +will fail because @w{group 3} didn't participate in the match. + +@end itemize + +You can use a back reference as an argument to a repetition operator. For +example, @samp{(a(b))\2*} matches @samp{a} followed by one or more +@samp{b}s. Similarly, @samp{(a(b))\2@{3@}} matches @samp{abbbb}. + +If there is no preceding @w{@var{digit}-th} subexpression, the regular +expression is invalid. + + +@node Anchoring Operators, , Back-reference Operator, Common Operators +@section Anchoring Operators + +@cindex anchoring +@cindex regexp anchoring + +These operators can constrain a pattern to match only at the beginning or +end of the entire string or at the beginning or end of a line.
+ +@menu +* Match-beginning-of-line Operator:: ^ +* Match-end-of-line Operator:: $ +@end menu + + +@node Match-beginning-of-line Operator, Match-end-of-line Operator, , Anchoring Operators +@subsection The Match-beginning-of-line Operator (@code{^}) + +@kindex ^ +@cindex beginning-of-line operator +@cindex anchors + +This operator can match the empty string either at the beginning of the +string or after a newline character. Thus, it is said to @dfn{anchor} +the pattern to the beginning of a line. + +In the cases following, @samp{^} represents this operator. (Otherwise, +@samp{^} is ordinary.) + +@itemize @bullet + +@item +It (the @samp{^}) is first in the pattern, as in @samp{^foo}. + +@cnindex RE_CONTEXT_INDEP_ANCHORS @r{(and @samp{^})} +@item +The syntax bit @code{RE_CONTEXT_INDEP_ANCHORS} is set, and it is outside +a bracket expression. + +@cindex open-group operator and @samp{^} +@cindex alternation operator and @samp{^} +@item +It follows an open-group or alternation operator, as in @samp{a\(^b\)} +and @samp{a\|^b}. @xref{Grouping Operators}, and @ref{Alternation +Operator}. + +@end itemize + +These rules imply that some valid patterns containing @samp{^} cannot be +matched; for example, @samp{foo^bar} if @code{RE_CONTEXT_INDEP_ANCHORS} +is set. + +@vindex not_bol @r{field in pattern buffer} +If the @code{not_bol} field is set in the pattern buffer (@pxref{GNU +Pattern Buffers}), then @samp{^} fails to match at the beginning of the +string. @xref{POSIX Matching}, for when you might find this useful. + +@vindex newline_anchor @r{field in pattern buffer} +If the @code{newline_anchor} field isn't set in the pattern buffer, then +@samp{^} fails to match after a newline. This is useful when you do not +regard the string to be matched as broken into lines.
+ + +@node Match-end-of-line Operator, , Match-beginning-of-line Operator, Anchoring Operators +@subsection The Match-end-of-line Operator (@code{$}) + +@kindex $ +@cindex end-of-line operator +@cindex anchors + +This operator can match the empty string either at the end of +the string or before a newline character in the string. Thus, it is +said to @dfn{anchor} the pattern to the end of a line. + +It is always represented by @samp{$}. For example, @samp{foo$} usually +matches, e.g., @samp{foo} and, e.g., the first three characters of +@samp{foo\nbar}. + +Its interaction with the syntax bits and pattern buffer fields is +exactly the dual of @samp{^}'s; see the previous section. (That is, +``beginning'' becomes ``end'', ``next'' becomes ``previous'', and +``after'' becomes ``before''.) + + +@node GNU Operators, GNU Emacs Operators, Common Operators, Top +@chapter GNU Operators + +Following are operators that @sc{gnu} defines (and @sc{posix} doesn't). + +@menu +* Word Operators:: +* Buffer Operators:: +@end menu + +@node Word Operators, Buffer Operators, , GNU Operators +@section Word Operators + +The operators in this section require Regex to recognize parts of words. +Regex uses a syntax table to determine whether or not a character is +part of a word, i.e., whether or not it is @dfn{word-constituent}. + +@menu +* Non-Emacs Syntax Tables:: +* Match-word-boundary Operator:: \b +* Match-within-word Operator:: \B +* Match-beginning-of-word Operator:: \< +* Match-end-of-word Operator:: \> +* Match-word-constituent Operator:: \w +* Match-non-word-constituent Operator:: \W +@end menu + +@node Non-Emacs Syntax Tables, Match-word-boundary Operator, , Word Operators +@subsection Non-Emacs Syntax Tables + +A @dfn{syntax table} is an array indexed by the characters in your +character set. In the @sc{ascii} encoding, therefore, a syntax table +has 256 elements. Regex always uses a @code{char *} variable +@code{re_syntax_table} as its syntax table. 
In some cases, it +initializes this variable and in others it expects you to initialize it. + +@itemize @bullet +@item +If Regex is compiled with the preprocessor symbols @code{emacs} and +@code{SYNTAX_TABLE} both undefined, then Regex allocates +@code{re_syntax_table} and initializes an element @var{i} either to +@code{Sword} (which it defines) if @var{i} is a letter, number, or +@samp{_}, or to zero if it's not. + +@item +If Regex is compiled with @code{emacs} undefined but @code{SYNTAX_TABLE} +defined, then Regex expects you to define a @code{char *} variable +@code{re_syntax_table} to be a valid syntax table. + +@item +@xref{Emacs Syntax Tables}, for what happens when Regex is compiled with +the preprocessor symbol @code{emacs} defined. + +@end itemize + +@node Match-word-boundary Operator, Match-within-word Operator, Non-Emacs Syntax Tables, Word Operators +@subsection The Match-word-boundary Operator (@code{\b}) + +@cindex @samp{\b} +@cindex word boundaries, matching + +This operator (represented by @samp{\b}) matches the empty string at +either the beginning or the end of a word. For example, @samp{\brat\b} +matches the separate word @samp{rat}. + +@node Match-within-word Operator, Match-beginning-of-word Operator, Match-word-boundary Operator, Word Operators +@subsection The Match-within-word Operator (@code{\B}) + +@cindex @samp{\B} + +This operator (represented by @samp{\B}) matches the empty string within +a word. For example, @samp{c\Brat\Be} matches @samp{crate}, but +@samp{dirty \Brat} doesn't match @samp{dirty rat}. + +@node Match-beginning-of-word Operator, Match-end-of-word Operator, Match-within-word Operator, Word Operators +@subsection The Match-beginning-of-word Operator (@code{\<}) + +@cindex @samp{\<} + +This operator (represented by @samp{\<}) matches the empty string at the +beginning of a word. 
+ +@node Match-end-of-word Operator, Match-word-constituent Operator, Match-beginning-of-word Operator, Word Operators +@subsection The Match-end-of-word Operator (@code{\>}) + +@cindex @samp{\>} + +This operator (represented by @samp{\>}) matches the empty string at the +end of a word. + +@node Match-word-constituent Operator, Match-non-word-constituent Operator, Match-end-of-word Operator, Word Operators +@subsection The Match-word-constituent Operator (@code{\w}) + +@cindex @samp{\w} + +This operator (represented by @samp{\w}) matches any word-constituent +character. + +@node Match-non-word-constituent Operator, , Match-word-constituent Operator, Word Operators +@subsection The Match-non-word-constituent Operator (@code{\W}) + +@cindex @samp{\W} + +This operator (represented by @samp{\W}) matches any character that is +not word-constituent. + + +@node Buffer Operators, , Word Operators, GNU Operators +@section Buffer Operators + +Following are operators which work on buffers. In Emacs, a @dfn{buffer} +is, naturally, an Emacs buffer. For other programs, Regex considers the +entire string to be matched as the buffer. + +@menu +* Match-beginning-of-buffer Operator:: \` +* Match-end-of-buffer Operator:: \' +@end menu + + +@node Match-beginning-of-buffer Operator, Match-end-of-buffer Operator, , Buffer Operators +@subsection The Match-beginning-of-buffer Operator (@code{\`}) + +@cindex @samp{\`} + +This operator (represented by @samp{\`}) matches the empty string at the +beginning of the buffer. + +@node Match-end-of-buffer Operator, , Match-beginning-of-buffer Operator, Buffer Operators +@subsection The Match-end-of-buffer Operator (@code{\'}) + +@cindex @samp{\'} + +This operator (represented by @samp{\'}) matches the empty string at the +end of the buffer. 
+ + +@node GNU Emacs Operators, What Gets Matched?, GNU Operators, Top +@chapter GNU Emacs Operators + +Following are operators that @sc{gnu} defines (and @sc{posix} doesn't) +that you can use only when Regex is compiled with the preprocessor +symbol @code{emacs} defined. + +@menu +* Syntactic Class Operators:: +@end menu + + +@node Syntactic Class Operators, , , GNU Emacs Operators +@section Syntactic Class Operators + +The operators in this section require Regex to recognize the syntactic +classes of characters. Regex uses a syntax table to determine this. + +@menu +* Emacs Syntax Tables:: +* Match-syntactic-class Operator:: \sCLASS +* Match-not-syntactic-class Operator:: \SCLASS +@end menu + +@node Emacs Syntax Tables, Match-syntactic-class Operator, , Syntactic Class Operators +@subsection Emacs Syntax Tables + +A @dfn{syntax table} is an array indexed by the characters in your +character set. In the @sc{ascii} encoding, therefore, a syntax table +has 256 elements. + +If Regex is compiled with the preprocessor symbol @code{emacs} defined, +then Regex expects you to define and initialize the variable +@code{re_syntax_table} to be an Emacs syntax table. Emacs' syntax +tables are more complicated than Regex's own (@pxref{Non-Emacs Syntax +Tables}). @xref{Syntax, , Syntax, emacs, The GNU Emacs User's Manual}, +for a description of Emacs' syntax tables. + +@node Match-syntactic-class Operator, Match-not-syntactic-class Operator, Emacs Syntax Tables, Syntactic Class Operators +@subsection The Match-syntactic-class Operator (@code{\s}@var{class}) + +@cindex @samp{\s} + +This operator matches any character whose syntactic class is represented +by a specified character. @samp{\s@var{class}} represents this operator +where @var{class} is the character representing the syntactic class you +want. For example, @samp{w} represents the syntactic +class of word-constituent characters, so @samp{\sw} matches any +word-constituent character. 
+ +@node Match-not-syntactic-class Operator, , Match-syntactic-class Operator, Syntactic Class Operators +@subsection The Match-not-syntactic-class Operator (@code{\S}@var{class}) + +@cindex @samp{\S} + +This operator is similar to the match-syntactic-class operator except +that it matches any character whose syntactic class is @emph{not} +represented by the specified character. @samp{\S@var{class}} represents +this operator. For example, @samp{w} represents the syntactic class of +word-constituent characters, so @samp{\Sw} matches any character that is +not word-constituent. + + +@node What Gets Matched?, Programming with Regex, GNU Emacs Operators, Top +@chapter What Gets Matched? + +Regex usually matches strings according to the ``leftmost longest'' +rule; that is, it chooses the longest of the leftmost matches. This +does not mean that, for a regular expression containing subexpressions, +it simply chooses the longest match for each subexpression, left to +right; the overall match must also be the longest possible one. + +For example, @samp{(ac*)(c*d[ac]*)\1} matches @samp{acdacaaa}, not +@samp{acdac}, as it would if it were to choose the longest match for the +first subexpression. + + +@node Programming with Regex, Copying, What Gets Matched?, Top +@chapter Programming with Regex + +Here we describe how you use the Regex data structures and functions in +C programs. Regex has three interfaces: one designed for @sc{gnu}, one +compatible with @sc{posix} and one compatible with Berkeley @sc{unix}. + +@menu +* GNU Regex Functions:: +* POSIX Regex Functions:: +* BSD Regex Functions:: +@end menu + + +@node GNU Regex Functions, POSIX Regex Functions, , Programming with Regex +@section GNU Regex Functions + +If you're writing code that doesn't need to be compatible with either +@sc{posix} or Berkeley @sc{unix}, you can use these functions. They +provide more options than the other interfaces. + +@menu +* GNU Pattern Buffers:: The re_pattern_buffer type.
+* GNU Regular Expression Compiling:: re_compile_pattern () +* GNU Matching:: re_match () +* GNU Searching:: re_search () +* Matching/Searching with Split Data:: re_match_2 (), re_search_2 () +* Searching with Fastmaps:: re_compile_fastmap () +* GNU Translate Tables:: The `translate' field. +* Using Registers:: The re_registers type and related fns. +* Freeing GNU Pattern Buffers:: regfree () +@end menu + + +@node GNU Pattern Buffers, GNU Regular Expression Compiling, , GNU Regex Functions +@subsection GNU Pattern Buffers + +@cindex pattern buffer, definition of +@tindex re_pattern_buffer @r{definition} +@tindex struct re_pattern_buffer @r{definition} + +To compile, match, or search for a given regular expression, you must +supply a pattern buffer. A @dfn{pattern buffer} holds one compiled +regular expression.@footnote{Regular expressions are also referred to as +``patterns,'' hence the name ``pattern buffer.''} + +You can have several different pattern buffers simultaneously, each +holding a compiled pattern for a different regular expression. + +@file{regex.h} defines the pattern buffer @code{struct} as follows: + +@example + /* Space that holds the compiled pattern. It is declared as + `unsigned char *' because its elements are + sometimes used as array indexes. */ + unsigned char *buffer; + + /* Number of bytes to which `buffer' points. */ + unsigned long allocated; + + /* Number of bytes actually used in `buffer'. */ + unsigned long used; + + /* Syntax setting with which the pattern was compiled. */ + reg_syntax_t syntax; + + /* Pointer to a fastmap, if any, otherwise zero. re_search uses + the fastmap, if there is one, to skip over impossible + starting points for matches. */ + char *fastmap; + + /* Either a translate table to apply to all characters before + comparing them, or zero for no translation. The translation + is applied to a pattern when it is compiled and to a string + when it is matched. 
*/ + char *translate; + + /* Number of subexpressions found by the compiler. */ + size_t re_nsub; + + /* Zero if this pattern cannot match the empty string, one else. + Well, in truth it's used only in `re_search_2', to see + whether or not we should use the fastmap, so we don't set + this absolutely perfectly; see `re_compile_fastmap' (the + `duplicate' case). */ + unsigned can_be_null : 1; + + /* If REGS_UNALLOCATED, allocate space in the `regs' structure + for `max (RE_NREGS, re_nsub + 1)' groups. + If REGS_REALLOCATE, reallocate space if necessary. + If REGS_FIXED, use what's there. */ +#define REGS_UNALLOCATED 0 +#define REGS_REALLOCATE 1 +#define REGS_FIXED 2 + unsigned regs_allocated : 2; + + /* Set to zero when `regex_compile' compiles a pattern; set to one + by `re_compile_fastmap' if it updates the fastmap. */ + unsigned fastmap_accurate : 1; + + /* If set, `re_match_2' does not return information about + subexpressions. */ + unsigned no_sub : 1; + + /* If set, a beginning-of-line anchor doesn't match at the + beginning of the string. */ + unsigned not_bol : 1; + + /* Similarly for an end-of-line anchor. */ + unsigned not_eol : 1; + + /* If true, an anchor at a newline matches. */ + unsigned newline_anchor : 1; + +@end example + + +@node GNU Regular Expression Compiling, GNU Matching, GNU Pattern Buffers, GNU Regex Functions +@subsection GNU Regular Expression Compiling + +In @sc{gnu}, you can both match and search for a given regular +expression. To do either, you must first compile it in a pattern buffer +(@pxref{GNU Pattern Buffers}). + +@cindex syntax initialization +@vindex re_syntax_options @r{initialization} +Regular expressions match according to the syntax with which they were +compiled; with @sc{gnu}, you indicate what syntax you want by setting +the variable @code{re_syntax_options} (declared in @file{regex.h} and +defined in @file{regex.c}) before calling the compiling function, +@code{re_compile_pattern} (see below). 
@xref{Syntax Bits}, and +@ref{Predefined Syntaxes}. + +You can change the value of @code{re_syntax_options} at any time. +Usually, however, you set its value once and then never change it. + +@cindex pattern buffer initialization +@code{re_compile_pattern} takes a pattern buffer as an argument. You +must initialize the following fields: + +@table @code + +@item translate +@vindex translate @r{initialization} +Initialize this to point to a translate table if you want one, or to +zero if you don't. We explain translate tables in @ref{GNU Translate +Tables}. + +@item fastmap +@vindex fastmap @r{initialization} +Initialize this to nonzero if you want a fastmap, or to zero if you +don't. + +@item buffer +@itemx allocated +@vindex buffer @r{initialization} +@vindex allocated @r{initialization} +@findex malloc +If you want @code{re_compile_pattern} to allocate memory for the +compiled pattern, set both of these to zero. If you have an existing +block of memory (allocated with @code{malloc}) you want Regex to use, +set @code{buffer} to its address and @code{allocated} to its size (in +bytes). + +@code{re_compile_pattern} uses @code{realloc} to extend the space for +the compiled pattern as necessary. + +@end table + +To compile a pattern buffer, use: + +@findex re_compile_pattern +@example +char * +re_compile_pattern (const char *@var{regex}, const int @var{regex_size}, + struct re_pattern_buffer *@var{pattern_buffer}) +@end example + +@noindent +@var{regex} is the regular expression's address, @var{regex_size} is its +length, and @var{pattern_buffer} is the pattern buffer's address. + +If @code{re_compile_pattern} successfully compiles the regular +expression, it returns zero and sets @code{*@var{pattern_buffer}} to the +compiled pattern. It sets the pattern buffer's fields as follows: + +@table @code +@item buffer +@vindex buffer @r{field, set by @code{re_compile_pattern}} +to the compiled pattern. 
+ +@item used +@vindex used @r{field, set by @code{re_compile_pattern}} +to the number of bytes the compiled pattern in @code{buffer} occupies. + +@item syntax +@vindex syntax @r{field, set by @code{re_compile_pattern}} +to the current value of @code{re_syntax_options}. + +@item re_nsub +@vindex re_nsub @r{field, set by @code{re_compile_pattern}} +to the number of subexpressions in @var{regex}. + +@item fastmap_accurate +@vindex fastmap_accurate @r{field, set by @code{re_compile_pattern}} +to zero on the theory that the pattern you're compiling is different +than the one previously compiled into @code{buffer}; in that case (since +you can't make a fastmap without a compiled pattern), +@code{fastmap} would either contain an incompatible fastmap, or nothing +at all. + +@c xx what else? +@end table + +If @code{re_compile_pattern} can't compile @var{regex}, it returns an +error string corresponding to one of the errors listed in @ref{POSIX +Regular Expression Compiling}. + + +@node GNU Matching, GNU Searching, GNU Regular Expression Compiling, GNU Regex Functions +@subsection GNU Matching + +@cindex matching with GNU functions + +Matching the @sc{gnu} way means trying to match as much of a string as +possible starting at a position within it you specify. Once you've compiled +a pattern into a pattern buffer (@pxref{GNU Regular Expression +Compiling}), you can ask the matcher to match that pattern against a +string using: + +@findex re_match +@example +int +re_match (struct re_pattern_buffer *@var{pattern_buffer}, + const char *@var{string}, const int @var{size}, + const int @var{start}, struct re_registers *@var{regs}) +@end example + +@noindent +@var{pattern_buffer} is the address of a pattern buffer containing a +compiled pattern. @var{string} is the string you want to match; it can +contain newline and null characters. @var{size} is the length of that +string. 
@var{start} is the string index at which you want to +begin matching; the first character of @var{string} is at index zero. +@xref{Using Registers}, for an explanation of @var{regs}; you can safely +pass zero. + +@code{re_match} matches the regular expression in @var{pattern_buffer} +against the string @var{string} according to the syntax in +@var{pattern_buffer}'s @code{syntax} field. (@xref{GNU Regular +Expression Compiling}, for how to set it.) The function returns +@math{-1} if the compiled pattern does not match any part of +@var{string} and @math{-2} if an internal error happens; otherwise, it +returns how many (possibly zero) characters of @var{string} the pattern +matched. + +An example: suppose @var{pattern_buffer} points to a pattern buffer +containing the compiled pattern for @samp{a*}, and @var{string} points +to @samp{aaaaab} (whereupon @var{size} should be 6). Then if @var{start} +is 2, @code{re_match} returns 3, i.e., @samp{a*} would have matched the +last three @samp{a}s in @var{string}. If @var{start} is 0, +@code{re_match} returns 5, i.e., @samp{a*} would have matched all the +@samp{a}s in @var{string}. If @var{start} is either 5 or 6, it returns +zero. + +If @var{start} is not between zero and @var{size}, then +@code{re_match} returns @math{-1}. + + +@node GNU Searching, Matching/Searching with Split Data, GNU Matching, GNU Regex Functions +@subsection GNU Searching + +@cindex searching with GNU functions + +@dfn{Searching} means trying to match starting at successive positions +within a string. The function @code{re_search} does this. + +Before calling @code{re_search}, you must compile your regular +expression. @xref{GNU Regular Expression Compiling}. 
+ +Here is the function declaration: + +@findex re_search +@example +int +re_search (struct re_pattern_buffer *@var{pattern_buffer}, + const char *@var{string}, const int @var{size}, + const int @var{start}, const int @var{range}, + struct re_registers *@var{regs}) +@end example + +@noindent +@vindex start @r{argument to @code{re_search}} +@vindex range @r{argument to @code{re_search}} +whose arguments are the same as those to @code{re_match} (@pxref{GNU +Matching}) except that the two arguments @var{start} and @var{range} +replace @code{re_match}'s argument @var{start}. + +If @var{range} is positive, then @code{re_search} attempts a match +starting first at index @var{start}, then at @math{@var{start} + 1} if +that fails, and so on, up to @math{@var{start} + @var{range}}; if +@var{range} is negative, then it attempts a match starting first at +index @var{start}, then at @math{@var{start} -1} if that fails, and so +on. + +If @var{start} is not between zero and @var{size}, then @code{re_search} +returns @math{-1}. When @var{range} is positive, @code{re_search} +adjusts @var{range} so that @math{@var{start} + @var{range} - 1} is +between zero and @var{size}, if necessary; that way it won't search +outside of @var{string}. Similarly, when @var{range} is negative, +@code{re_search} adjusts @var{range} so that @math{@var{start} + +@var{range} + 1} is between zero and @var{size}, if necessary. + +If the @code{fastmap} field of @var{pattern_buffer} is zero, +@code{re_search} matches starting at consecutive positions; otherwise, +it uses @code{fastmap} to make the search more efficient. +@xref{Searching with Fastmaps}. + +If no match is found, @code{re_search} returns @math{-1}. If +a match is found, it returns the index where the match began. If an +internal error happens, it returns @math{-2}. 
+ + +@node Matching/Searching with Split Data, Searching with Fastmaps, GNU Searching, GNU Regex Functions +@subsection Matching and Searching with Split Data + +Using the functions @code{re_match_2} and @code{re_search_2}, you can +match or search in data that is divided into two strings. + +The function: + +@findex re_match_2 +@example +int +re_match_2 (struct re_pattern_buffer *@var{buffer}, + const char *@var{string1}, const int @var{size1}, + const char *@var{string2}, const int @var{size2}, + const int @var{start}, + struct re_registers *@var{regs}, + const int @var{stop}) +@end example + +@noindent +is similar to @code{re_match} (@pxref{GNU Matching}) except that you +pass @emph{two} data strings and sizes, and an index @var{stop} beyond +which you don't want the matcher to try matching. As with +@code{re_match}, if it succeeds, @code{re_match_2} returns how many +characters of @var{string} it matched. Regard @var{string1} and +@var{string2} as concatenated when you set the arguments @var{start} and +@var{stop} and use the contents of @var{regs}; @code{re_match_2} never +returns a value larger than @math{@var{size1} + @var{size2}}. + +The function: + +@findex re_search_2 +@example +int +re_search_2 (struct re_pattern_buffer *@var{buffer}, + const char *@var{string1}, const int @var{size1}, + const char *@var{string2}, const int @var{size2}, + const int @var{start}, const int @var{range}, + struct re_registers *@var{regs}, + const int @var{stop}) +@end example + +@noindent +is similarly related to @code{re_search}. + + +@node Searching with Fastmaps, GNU Translate Tables, Matching/Searching with Split Data, GNU Regex Functions +@subsection Searching with Fastmaps + +@cindex fastmaps +If you're searching through a long string, you should use a fastmap. +Without one, the searcher tries to match at consecutive positions in the +string. Generally, most of the characters in the string could not start +a match. 
It takes much longer to try matching at a given position in the +string than it does to check in a table whether or not the character at +that position could start a match. A @dfn{fastmap} is such a table. + +More specifically, a fastmap is an array indexed by the characters in +your character set. Under the @sc{ascii} encoding, therefore, a fastmap +has 256 elements. If you want the searcher to use a fastmap with a +given pattern buffer, you must allocate the array and assign the array's +address to the pattern buffer's @code{fastmap} field. You either can +compile the fastmap yourself or have @code{re_search} do it for you; +when @code{fastmap} is nonzero, it automatically compiles a fastmap the +first time you search using a particular compiled pattern. + +To compile a fastmap yourself, use: + +@findex re_compile_fastmap +@example +int +re_compile_fastmap (struct re_pattern_buffer *@var{pattern_buffer}) +@end example + +@noindent +@var{pattern_buffer} is the address of a pattern buffer. If the +character @var{c} could start a match for the pattern, +@code{re_compile_fastmap} makes +@code{@var{pattern_buffer}->fastmap[@var{c}]} nonzero. It returns +@math{0} if it can compile a fastmap and @math{-2} if there is an +internal error. For example, if @samp{|} is the alternation operator +and @var{pattern_buffer} holds the compiled pattern for @samp{a|b}, then +@code{re_compile_fastmap} sets @code{fastmap['a']} and +@code{fastmap['b']} (and no others). + +@code{re_search} uses a fastmap as it moves along in the string: it +checks the string's characters until it finds one that's in the fastmap. +Then it tries matching at that character. If the match fails, it +repeats the process. So, by using a fastmap, @code{re_search} doesn't +waste time trying to match at positions in the string that couldn't +start a match. + +If you don't want @code{re_search} to use a fastmap, +store zero in the @code{fastmap} field of the pattern buffer before +calling @code{re_search}. 
+ +Once you've initialized a pattern buffer's @code{fastmap} field, you +need never do so again---even if you compile a new pattern in +it---provided the way the field is set still reflects whether or not you +want a fastmap. @code{re_search} will still either do nothing if +@code{fastmap} is null or, if it isn't, compile a new fastmap for the +new pattern. + +@node GNU Translate Tables, Using Registers, Searching with Fastmaps, GNU Regex Functions +@subsection GNU Translate Tables + +If you set the @code{translate} field of a pattern buffer to a translate +table, then the @sc{gnu} Regex functions to which you've passed that +pattern buffer use it to apply a simple transformation +to all the regular expression and string characters at which they look. + +A @dfn{translate table} is an array indexed by the characters in your +character set. Under the @sc{ascii} encoding, therefore, a translate +table has 256 elements. The array's elements are also characters in +your character set. When the Regex functions see a character @var{c}, +they use @code{translate[@var{c}]} in its place, with one exception: the +character after a @samp{\} is not translated. (This ensures that +operators such as @samp{\B} and @samp{\b} are always distinguishable.) + +For example, a table that maps all lowercase letters to the +corresponding uppercase ones would cause the matcher to ignore +differences in case.@footnote{A table that maps all uppercase letters to +the corresponding lowercase ones would work just as well for this +purpose.} Such a table would map all characters except lowercase letters +to themselves, and lowercase letters to the corresponding uppercase +ones. 
Under the @sc{ascii} encoding, here's how you could initialize +such a table (we'll call it @code{case_fold}): + +@example +for (i = 0; i < 256; i++) + case_fold[i] = i; +for (i = 'a'; i <= 'z'; i++) + case_fold[i] = i - ('a' - 'A'); +@end example + +You tell Regex to use a translate table on a given pattern buffer by +assigning that table's address to the @code{translate} field of that +buffer. If you don't want Regex to do any translation, put zero into +this field. You'll get weird results if you change the table's contents +anytime between compiling the pattern buffer, compiling its fastmap, and +matching or searching with the pattern buffer. + +@node Using Registers, Freeing GNU Pattern Buffers, GNU Translate Tables, GNU Regex Functions +@subsection Using Registers + +A group in a regular expression can match a (possibly empty) substring +of the string that the regular expression as a whole matched. The matcher +remembers the beginning and end of the substring matched by +each group. + +To find out what they matched, pass a nonzero @var{regs} argument to a +@sc{gnu} matching or searching function (@pxref{GNU Matching} and +@ref{GNU Searching}), i.e., the address of a structure of this type, as +defined in @file{regex.h}: + +@c We don't bother to include this directly from regex.h, +@c since it changes so rarely. +@example +@tindex re_registers +@vindex num_regs @r{in @code{struct re_registers}} +@vindex start @r{in @code{struct re_registers}} +@vindex end @r{in @code{struct re_registers}} +struct re_registers +@{ + unsigned num_regs; + regoff_t *start; + regoff_t *end; +@}; +@end example + +Except for (possibly) the @var{num_regs}'th element (see below), the +@var{i}th element of the @code{start} and @code{end} arrays records +information about the @var{i}th group in the pattern. (They're declared +as C pointers, but this is only because not all C compilers accept +zero-length arrays; conceptually, it is simplest to think of them as +arrays.) 
+ +The @code{start} and @code{end} arrays are allocated in various ways, +depending on the value of the @code{regs_allocated} +@vindex regs_allocated +field in the pattern buffer passed to the matcher. + +The simplest and perhaps most useful is to let the matcher (re)allocate +enough space to record information for all the groups in the regular +expression. If @code{regs_allocated} is @code{REGS_UNALLOCATED}, +@vindex REGS_UNALLOCATED +the matcher allocates @math{1 + @var{re_nsub}} (another field in the +pattern buffer; @pxref{GNU Pattern Buffers}). The extra element is set +to @math{-1}, and sets @code{regs_allocated} to @code{REGS_REALLOCATE}. +@vindex REGS_REALLOCATE +Then on subsequent calls with the same pattern buffer and @var{regs} +arguments, the matcher reallocates more space if necessary. + +It would perhaps be more logical to make the @code{regs_allocated} field +part of the @code{re_registers} structure, instead of part of the +pattern buffer. But in that case the caller would be forced to +initialize the structure before passing it. Much existing code doesn't +do this initialization, and it's arguably better to avoid it anyway. + +@code{re_compile_pattern} sets @code{regs_allocated} to +@code{REGS_UNALLOCATED}, +so if you use the GNU regular expression +functions, you get this behavior by default. + +xx document re_set_registers + +@sc{posix}, on the other hand, requires a different interface: the +caller is supposed to pass in a fixed-length array which the matcher +fills. Therefore, if @code{regs_allocated} is @code{REGS_FIXED} +@vindex REGS_FIXED +the matcher simply fills that array. + +The following examples illustrate the information recorded in the +@code{re_registers} structure. (In all of them, @samp{(} represents the +open-group and @samp{)} the close-group operator. The first character +in the string @var{string} is at index 0.) + +@c xx i'm not sure this is all true anymore. 
+ +@itemize @bullet + +@item +If the regular expression has an @w{@var{i}-th} +group not contained within another group that matches a +substring of @var{string}, then the function sets +@code{@w{@var{regs}->}start[@var{i}]} to the index in @var{string} where +the substring matched by the @w{@var{i}-th} group begins, and +@code{@w{@var{regs}->}end[@var{i}]} to the index just beyond that +substring's end. The function sets @code{@w{@var{regs}->}start[0]} and +@code{@w{@var{regs}->}end[0]} to analogous information about the entire +pattern. + +For example, when you match @samp{((a)(b))} against @samp{ab}, you get: + +@itemize +@item +0 in @code{@w{@var{regs}->}start[0]} and 2 in @code{@w{@var{regs}->}end[0]} + +@item +0 in @code{@w{@var{regs}->}start[1]} and 2 in @code{@w{@var{regs}->}end[1]} + +@item +0 in @code{@w{@var{regs}->}start[2]} and 1 in @code{@w{@var{regs}->}end[2]} + +@item +1 in @code{@w{@var{regs}->}start[3]} and 2 in @code{@w{@var{regs}->}end[3]} +@end itemize + +@item +If a group matches more than once (as it might if followed by, +e.g., a repetition operator), then the function reports the information +about what the group @emph{last} matched. + +For example, when you match the pattern @samp{(a)*} against the string +@samp{aa}, you get: + +@itemize +@item +0 in @code{@w{@var{regs}->}start[0]} and 2 in @code{@w{@var{regs}->}end[0]} + +@item +1 in @code{@w{@var{regs}->}start[1]} and 2 in @code{@w{@var{regs}->}end[1]} +@end itemize + +@item +If the @w{@var{i}-th} group does not participate in a +successful match, e.g., it is an alternative not taken or a +repetition operator allows zero repetitions of it, then the function +sets @code{@w{@var{regs}->}start[@var{i}]} and +@code{@w{@var{regs}->}end[@var{i}]} to @math{-1}. 
+ +For example, when you match the pattern @samp{(a)*b} against +the string @samp{b}, you get: + +@itemize +@item +0 in @code{@w{@var{regs}->}start[0]} and 1 in @code{@w{@var{regs}->}end[0]} + +@item +@math{-1} in @code{@w{@var{regs}->}start[1]} and @math{-1} in @code{@w{@var{regs}->}end[1]} +@end itemize + +@item +If the @w{@var{i}-th} group matches a zero-length string, then the +function sets @code{@w{@var{regs}->}start[@var{i}]} and +@code{@w{@var{regs}->}end[@var{i}]} to the index just beyond that +zero-length string. + +For example, when you match the pattern @samp{(a*)b} against the string +@samp{b}, you get: + +@itemize +@item +0 in @code{@w{@var{regs}->}start[0]} and 1 in @code{@w{@var{regs}->}end[0]} + +@item +0 in @code{@w{@var{regs}->}start[1]} and 0 in @code{@w{@var{regs}->}end[1]} +@end itemize + +@ignore +The function sets @code{@w{@var{regs}->}start[0]} and +@code{@w{@var{regs}->}end[0]} to analogous information about the entire +pattern. + +For example, when you match the pattern @samp{(a*)} against the empty +string, you get: + +@itemize +@item +0 in @code{@w{@var{regs}->}start[0]} and 0 in @code{@w{@var{regs}->}end[0]} + +@item +0 in @code{@w{@var{regs}->}start[1]} and 0 in @code{@w{@var{regs}->}end[1]} +@end itemize +@end ignore + +@item +If an @w{@var{i}-th} group contains a @w{@var{j}-th} group +in turn not contained within any other group within group @var{i} and +the function reports a match of the @w{@var{i}-th} group, then it +records in @code{@w{@var{regs}->}start[@var{j}]} and +@code{@w{@var{regs}->}end[@var{j}]} the last match (if it matched) of +the @w{@var{j}-th} group. 
+ +For example, when you match the pattern @samp{((a*)b)*} against the +string @samp{abb}, @w{group 2} last matches the empty string, so you +get what it previously matched: + +@itemize +@item +0 in @code{@w{@var{regs}->}start[0]} and 3 in @code{@w{@var{regs}->}end[0]} + +@item +2 in @code{@w{@var{regs}->}start[1]} and 3 in @code{@w{@var{regs}->}end[1]} + +@item +2 in @code{@w{@var{regs}->}start[2]} and 2 in @code{@w{@var{regs}->}end[2]} +@end itemize + +When you match the pattern @samp{((a)*b)*} against the string +@samp{abb}, @w{group 2} doesn't participate in the last match, so you +get: + +@itemize +@item +0 in @code{@w{@var{regs}->}start[0]} and 3 in @code{@w{@var{regs}->}end[0]} + +@item +2 in @code{@w{@var{regs}->}start[1]} and 3 in @code{@w{@var{regs}->}end[1]} + +@item +0 in @code{@w{@var{regs}->}start[2]} and 1 in @code{@w{@var{regs}->}end[2]} +@end itemize + +@item +If an @w{@var{i}-th} group contains a @w{@var{j}-th} group +in turn not contained within any other group within group @var{i} +and the function sets +@code{@w{@var{regs}->}start[@var{i}]} and +@code{@w{@var{regs}->}end[@var{i}]} to @math{-1}, then it also sets +@code{@w{@var{regs}->}start[@var{j}]} and +@code{@w{@var{regs}->}end[@var{j}]} to @math{-1}. 
+ +For example, when you match the pattern @samp{((a)*b)*c} against the +string @samp{c}, you get: + +@itemize +@item +0 in @code{@w{@var{regs}->}start[0]} and 1 in @code{@w{@var{regs}->}end[0]} + +@item +@math{-1} in @code{@w{@var{regs}->}start[1]} and @math{-1} in @code{@w{@var{regs}->}end[1]} + +@item +@math{-1} in @code{@w{@var{regs}->}start[2]} and @math{-1} in @code{@w{@var{regs}->}end[2]} +@end itemize + +@end itemize + +@node Freeing GNU Pattern Buffers, , Using Registers, GNU Regex Functions +@subsection Freeing GNU Pattern Buffers + +To free any allocated fields of a pattern buffer, you can use the +@sc{posix} function described in @ref{Freeing POSIX Pattern Buffers}, +since the type @code{regex_t}---the type for @sc{posix} pattern +buffers---is equivalent to the type @code{re_pattern_buffer}. After +freeing a pattern buffer, you need to again compile a regular expression +in it (@pxref{GNU Regular Expression Compiling}) before passing it to +a matching or searching function. + + +@node POSIX Regex Functions, BSD Regex Functions, GNU Regex Functions, Programming with Regex +@section POSIX Regex Functions + +If you're writing code that has to be @sc{posix} compatible, you'll need +to use these functions. Their interfaces are as specified by @sc{posix}, +draft 1003.2/D11.2. + +@menu +* POSIX Pattern Buffers:: The regex_t type. +* POSIX Regular Expression Compiling:: regcomp () +* POSIX Matching:: regexec () +* Reporting Errors:: regerror () +* Using Byte Offsets:: The regmatch_t type. +* Freeing POSIX Pattern Buffers:: regfree () +@end menu + + +@node POSIX Pattern Buffers, POSIX Regular Expression Compiling, , POSIX Regex Functions +@subsection POSIX Pattern Buffers + +To compile or match a given regular expression the @sc{posix} way, you +must supply a pattern buffer exactly the way you do for @sc{gnu} +(@pxref{GNU Pattern Buffers}). 
@sc{posix} pattern buffers have type +@code{regex_t}, which is equivalent to the @sc{gnu} pattern buffer +type @code{re_pattern_buffer}. + + +@node POSIX Regular Expression Compiling, POSIX Matching, POSIX Pattern Buffers, POSIX Regex Functions +@subsection POSIX Regular Expression Compiling + +With @sc{posix}, you can only search for a given regular expression; you +can't match it. To do this, you must first compile it in a +pattern buffer, using @code{regcomp}. + +@ignore +Before calling @code{regcomp}, you must initialize this pattern buffer +as you do for @sc{gnu} (@pxref{GNU Regular Expression Compiling}). See +below, however, for how to choose a syntax with which to compile. +@end ignore + +To compile a pattern buffer, use: + +@findex regcomp +@example +int +regcomp (regex_t *@var{preg}, const char *@var{regex}, int @var{cflags}) +@end example + +@noindent +@var{preg} is the initialized pattern buffer's address, @var{regex} is +the regular expression's address, and @var{cflags} is the compilation +flags, which Regex considers as a collection of bits. Here are the +valid bits, as defined in @file{regex.h}: + +@table @code + +@item REG_EXTENDED +@vindex REG_EXTENDED +says to use @sc{posix} Extended Regular Expression syntax; if this isn't +set, then says to use @sc{posix} Basic Regular Expression syntax. +@code{regcomp} sets @var{preg}'s @code{syntax} field accordingly. + +@item REG_ICASE +@vindex REG_ICASE +@cindex ignoring case +says to ignore case; @code{regcomp} sets @var{preg}'s @code{translate} +field to a translate table which ignores case, replacing anything you've +put there before. + +@item REG_NOSUB +@vindex REG_NOSUB +says to set @var{preg}'s @code{no_sub} field; @pxref{POSIX Matching}, +for what this means. + +@item REG_NEWLINE +@vindex REG_NEWLINE +says that a: + +@itemize @bullet + +@item +match-any-character operator (@pxref{Match-any-character +Operator}) doesn't match a newline. 
+ +@item +nonmatching list not containing a newline (@pxref{List +Operators}) doesn't match a newline. + +@item +match-beginning-of-line operator (@pxref{Match-beginning-of-line +Operator}) matches the empty string immediately after a newline, +regardless of how @code{REG_NOTBOL} is set (@pxref{POSIX Matching}, for +an explanation of @code{REG_NOTBOL}). + +@item +match-end-of-line operator (@pxref{Match-end-of-line +Operator}) matches the empty string immediately before a newline, +regardless of how @code{REG_NOTEOL} is set (@pxref{POSIX Matching}, +for an explanation of @code{REG_NOTEOL}). + +@end itemize + +@end table + +If @code{regcomp} successfully compiles the regular expression, it +returns zero and sets @code{*@var{preg}} to the compiled +pattern. Except for @code{syntax} (which it sets as explained above), it +also sets the same fields the same way as does the @sc{gnu} compiling +function (@pxref{GNU Regular Expression Compiling}). + +If @code{regcomp} can't compile the regular expression, it returns one +of the error codes listed here. (Except when noted differently, the +syntax in all examples below is basic regular expression syntax.) + +@table @code + +@comment repetitions +@item REG_BADRPT +For example, the consecutive repetition operators @samp{**} in +@samp{a**} are invalid. As another example, if the syntax is extended +regular expression syntax, then the repetition operator @samp{*} with +nothing on which to operate in @samp{*} is invalid. + +@item REG_BADBR +For example, the @var{count} @samp{-1} in @samp{a\@{-1} is invalid. + +@item REG_EBRACE +For example, @samp{a\@{1} is missing a close-interval operator. + +@comment lists +@item REG_EBRACK +For example, @samp{[a} is missing a close-list operator. + +@item REG_ERANGE +For example, the range ending point @samp{a} that collates lower than +does its starting point @samp{z} in @samp{[z-a]} is invalid. 
Also, the +range with the character class @samp{[:alpha:]} as its starting point in +@samp{[[:alpha:]-|]} is invalid. + +@item REG_ECTYPE +For example, the character class name @samp{foo} in @samp{[[:foo:]]} is +invalid. + +@comment groups +@item REG_EPAREN +For example, @samp{a\)} is missing an open-group operator and @samp{\(a} +is missing a close-group operator. + +@item REG_ESUBREG +For example, the back reference @samp{\2} that refers to a nonexistent +subexpression in @samp{\(a\)\2} is invalid. + +@comment unfinished business + +@item REG_EEND +Returned when a regular expression causes no other more specific error. + +@item REG_EESCAPE +For example, the trailing backslash @samp{\} in @samp{a\} is invalid, as is the +one in @samp{\}. + +@comment kitchen sink +@item REG_BADPAT +For example, in the extended regular expression syntax, the empty group +@samp{()} in @samp{a()b} is invalid. + +@comment internal +@item REG_ESIZE +Returned when a regular expression needs a pattern buffer larger than +65536 bytes. + +@item REG_ESPACE +Returned when a regular expression causes Regex to run out of memory. + +@end table + + +@node POSIX Matching, Reporting Errors, POSIX Regular Expression Compiling, POSIX Regex Functions +@subsection POSIX Matching + +Matching the @sc{posix} way means trying to match a null-terminated +string starting at its first character. Once you've compiled a pattern +into a pattern buffer (@pxref{POSIX Regular Expression Compiling}), you +can ask the matcher to match that pattern against a string using: + +@findex regexec +@example +int +regexec (const regex_t *@var{preg}, const char *@var{string}, + size_t @var{nmatch}, regmatch_t @var{pmatch}[], int @var{eflags}) +@end example + +@noindent +@var{preg} is the address of a pattern buffer for a compiled pattern. +@var{string} is the string you want to match. + +@xref{Using Byte Offsets}, for an explanation of @var{pmatch}. 
If you +pass zero for @var{nmatch} or you compiled @var{preg} with the +compilation flag @code{REG_NOSUB} set, then @code{regexec} will ignore +@var{pmatch}; otherwise, you must allocate it to have at least +@var{nmatch} elements. @code{regexec} will record @var{nmatch} byte +offsets in @var{pmatch}, and set to @math{-1} any unused elements up to +@code{@var{pmatch}[@var{nmatch} - 1]}. + +@var{eflags} specifies @dfn{execution flags}---namely, the two bits +@code{REG_NOTBOL} and @code{REG_NOTEOL} (defined in @file{regex.h}). If +you set @code{REG_NOTBOL}, then the match-beginning-of-line operator +(@pxref{Match-beginning-of-line Operator}) always fails to match. +This lets you match against pieces of a line, as you would need to if, +say, searching for repeated instances of a given pattern in a line; it +would work correctly for patterns both with and without +match-beginning-of-line operators. @code{REG_NOTEOL} works analogously +for the match-end-of-line operator (@pxref{Match-end-of-line +Operator}); it exists for symmetry. + +@code{regexec} tries to find a match for @var{preg} in @var{string} +according to the syntax in @var{preg}'s @code{syntax} field. +(@xref{POSIX Regular Expression Compiling}, for how to set it.) The +function returns zero if the compiled pattern matches @var{string} and +@code{REG_NOMATCH} (defined in @file{regex.h}) if it doesn't. + +@node Reporting Errors, Using Byte Offsets, POSIX Matching, POSIX Regex Functions +@subsection Reporting Errors + +If either @code{regcomp} or @code{regexec} fails, it returns a nonzero +error code, the possibilities for which are defined in @file{regex.h}. +@xref{POSIX Regular Expression Compiling}, and @ref{POSIX Matching}, for +what these codes mean. 
To get an error string corresponding to these +codes, you can use: + +@findex regerror +@example +size_t +regerror (int @var{errcode}, + const regex_t *@var{preg}, + char *@var{errbuf}, + size_t @var{errbuf_size}) +@end example + +@noindent +@var{errcode} is an error code, @var{preg} is the address of the pattern +buffer which provoked the error, @var{errbuf} is the error buffer, and +@var{errbuf_size} is @var{errbuf}'s size. + +@code{regerror} returns the size in bytes of the error string +corresponding to @var{errcode} (including its terminating null). If +@var{errbuf} and @var{errbuf_size} are nonzero, it also returns in +@var{errbuf} the first @math{@var{errbuf_size} - 1} characters of the +error string, followed by a null. +@var{errbuf_size} must be a nonnegative number less than or equal to the +size in bytes of @var{errbuf}. + +You can call @code{regerror} with a null @var{errbuf} and a zero +@var{errbuf_size} to determine how large @var{errbuf} need be to +accommodate @code{regerror}'s error string. + +@node Using Byte Offsets, Freeing POSIX Pattern Buffers, Reporting Errors, POSIX Regex Functions +@subsection Using Byte Offsets + +In @sc{posix}, variables of type @code{regmatch_t} hold information +analogous to, but not identical to, @sc{gnu}'s registers (@pxref{Using +Registers}). To get information about registers in @sc{posix}, pass to +@code{regexec} a nonzero @var{pmatch} of type @code{regmatch_t}, i.e., +the address of a structure of this type, defined in +@file{regex.h}: + +@tindex regmatch_t +@example +typedef struct +@{ + regoff_t rm_so; + regoff_t rm_eo; +@} regmatch_t; +@end example + +When reading in @ref{Using Registers}, about how the matching function +stores the information into the registers, substitute @var{pmatch} for +@var{regs}, @code{@w{@var{pmatch}[@var{i}].}rm_so} for +@code{@w{@var{regs}->}start[@var{i}]} and +@code{@w{@var{pmatch}[@var{i}].}rm_eo} for +@code{@w{@var{regs}->}end[@var{i}]}. 
+ +@node Freeing POSIX Pattern Buffers, , Using Byte Offsets, POSIX Regex Functions +@subsection Freeing POSIX Pattern Buffers + +To free any allocated fields of a pattern buffer, use: + +@findex regfree +@example +void +regfree (regex_t *@var{preg}) +@end example + +@noindent +@var{preg} is the pattern buffer whose allocated fields you want freed. +@code{regfree} also sets @var{preg}'s @code{allocated} and @code{used} +fields to zero. After freeing a pattern buffer, you need to again +compile a regular expression in it (@pxref{POSIX Regular Expression +Compiling}) before passing it to the matching function (@pxref{POSIX +Matching}). + + +@node BSD Regex Functions, , POSIX Regex Functions, Programming with Regex +@section BSD Regex Functions + +If you're writing code that has to be Berkeley @sc{unix} compatible, +you'll need to use these functions whose interfaces are the same as those +in Berkeley @sc{unix}. + +@menu +* BSD Regular Expression Compiling:: re_comp () +* BSD Searching:: re_exec () +@end menu + +@node BSD Regular Expression Compiling, BSD Searching, , BSD Regex Functions +@subsection BSD Regular Expression Compiling + +With Berkeley @sc{unix}, you can only search for a given regular +expression; you can't match one. To search for it, you must first +compile it. Before you compile it, you must indicate the regular +expression syntax you want it compiled according to by setting the +variable @code{re_syntax_options} (declared in @file{regex.h}) to some +syntax (@pxref{Regular Expression Syntax}). + +To compile a regular expression, use: + +@findex re_comp +@example +char * +re_comp (char *@var{regex}) +@end example + +@noindent +@var{regex} is the address of a null-terminated regular expression. +@code{re_comp} uses an internal pattern buffer, so you can use only the +most recently compiled pattern buffer. 
This means that if you want to +use a given regular expression that you've already compiled---but it +isn't the latest one you've compiled---you'll have to recompile it. If +you call @code{re_comp} with the null string (@emph{not} the empty +string) as the argument, it doesn't change the contents of the pattern +buffer. + +If @code{re_comp} successfully compiles the regular expression, it +returns zero. If it can't compile the regular expression, it returns +an error string. @code{re_comp}'s error messages are identical to those +of @code{re_compile_pattern} (@pxref{GNU Regular Expression +Compiling}). + +@node BSD Searching, , BSD Regular Expression Compiling, BSD Regex Functions +@subsection BSD Searching + +Searching the Berkeley @sc{unix} way means searching in a string +starting at its first character and trying successive positions within +it to find a match. Once you've compiled a pattern using @code{re_comp} +(@pxref{BSD Regular Expression Compiling}), you can ask Regex +to search for that pattern in a string using: + +@findex re_exec +@example +int +re_exec (char *@var{string}) +@end example + +@noindent +@var{string} is the address of the null-terminated string in which you +want to search. + +@code{re_exec} returns either 1 for success or 0 for failure. It +automatically uses a @sc{gnu} fastmap (@pxref{Searching with Fastmaps}). + + +@node Copying, Index, Programming with Regex, Top +@appendix GNU GENERAL PUBLIC LICENSE +@center Version 2, June 1991 + +@display +Copyright @copyright{} 1989, 1991 Free Software Foundation, Inc. +675 Mass Ave, Cambridge, MA 02139, USA + +Everyone is permitted to copy and distribute verbatim copies +of this license document, but changing it is not allowed. +@end display + +@unnumberedsec Preamble + + The licenses for most software are designed to take away your +freedom to share and change it. 
By contrast, the GNU General Public +License is intended to guarantee your freedom to share and change free +software---to make sure the software is free for all its users. This +General Public License applies to most of the Free Software +Foundation's software and to any other program whose authors commit to +using it. (Some other Free Software Foundation software is covered by +the GNU Library General Public License instead.) You can apply it to +your programs, too. + + When we speak of free software, we are referring to freedom, not +price. Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +this service if you wish), that you receive source code or can get it +if you want it, that you can change the software or use pieces of it +in new free programs; and that you know you can do these things. + + To protect your rights, we need to make restrictions that forbid +anyone to deny you these rights or to ask you to surrender the rights. +These restrictions translate to certain responsibilities for you if you +distribute copies of the software, or if you modify it. + + For example, if you distribute copies of such a program, whether +gratis or for a fee, you must give the recipients all the rights that +you have. You must make sure that they, too, receive or can get the +source code. And you must show them these terms so they know their +rights. + + We protect your rights with two steps: (1) copyright the software, and +(2) offer you this license which gives you legal permission to copy, +distribute and/or modify the software. + + Also, for each author's protection and ours, we want to make certain +that everyone understands that there is no warranty for this free +software. 
If the software is modified by someone else and passed on, we +want its recipients to know that what they have is not the original, so +that any problems introduced by others will not reflect on the original +authors' reputations. + + Finally, any free program is threatened constantly by software +patents. We wish to avoid the danger that redistributors of a free +program will individually obtain patent licenses, in effect making the +program proprietary. To prevent this, we have made it clear that any +patent must be licensed for everyone's free use or not licensed at all. + + The precise terms and conditions for copying, distribution and +modification follow. + +@iftex +@unnumberedsec TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION +@end iftex +@ifinfo +@center TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION +@end ifinfo + +@enumerate +@item +This License applies to any program or other work which contains +a notice placed by the copyright holder saying it may be distributed +under the terms of this General Public License. The ``Program'', below, +refers to any such program or work, and a ``work based on the Program'' +means either the Program or any derivative work under copyright law: +that is to say, a work containing the Program or a portion of it, +either verbatim or with modifications and/or translated into another +language. (Hereinafter, translation is included without limitation in +the term ``modification''.) Each licensee is addressed as ``you''. + +Activities other than copying, distribution and modification are not +covered by this License; they are outside its scope. The act of +running the Program is not restricted, and the output from the Program +is covered only if its contents constitute a work based on the +Program (independent of having been made by running the Program). +Whether that is true depends on what the Program does. 
+ +@item +You may copy and distribute verbatim copies of the Program's +source code as you receive it, in any medium, provided that you +conspicuously and appropriately publish on each copy an appropriate +copyright notice and disclaimer of warranty; keep intact all the +notices that refer to this License and to the absence of any warranty; +and give any other recipients of the Program a copy of this License +along with the Program. + +You may charge a fee for the physical act of transferring a copy, and +you may at your option offer warranty protection in exchange for a fee. + +@item +You may modify your copy or copies of the Program or any portion +of it, thus forming a work based on the Program, and copy and +distribute such modifications or work under the terms of Section 1 +above, provided that you also meet all of these conditions: + +@enumerate a +@item +You must cause the modified files to carry prominent notices +stating that you changed the files and the date of any change. + +@item +You must cause any work that you distribute or publish, that in +whole or in part contains or is derived from the Program or any +part thereof, to be licensed as a whole at no charge to all third +parties under the terms of this License. + +@item +If the modified program normally reads commands interactively +when run, you must cause it, when started running for such +interactive use in the most ordinary way, to print or display an +announcement including an appropriate copyright notice and a +notice that there is no warranty (or else, saying that you provide +a warranty) and that users may redistribute the program under +these conditions, and telling the user how to view a copy of this +License. (Exception: if the Program itself is interactive but +does not normally print such an announcement, your work based on +the Program is not required to print an announcement.) +@end enumerate + +These requirements apply to the modified work as a whole. 
If +identifiable sections of that work are not derived from the Program, +and can be reasonably considered independent and separate works in +themselves, then this License, and its terms, do not apply to those +sections when you distribute them as separate works. But when you +distribute the same sections as part of a whole which is a work based +on the Program, the distribution of the whole must be on the terms of +this License, whose permissions for other licensees extend to the +entire whole, and thus to each and every part regardless of who wrote it. + +Thus, it is not the intent of this section to claim rights or contest +your rights to work written entirely by you; rather, the intent is to +exercise the right to control the distribution of derivative or +collective works based on the Program. + +In addition, mere aggregation of another work not based on the Program +with the Program (or with a work based on the Program) on a volume of +a storage or distribution medium does not bring the other work under +the scope of this License. + +@item +You may copy and distribute the Program (or a work based on it, +under Section 2) in object code or executable form under the terms of +Sections 1 and 2 above provided that you also do one of the following: + +@enumerate a +@item +Accompany it with the complete corresponding machine-readable +source code, which must be distributed under the terms of Sections +1 and 2 above on a medium customarily used for software interchange; or, + +@item +Accompany it with a written offer, valid for at least three +years, to give any third party, for a charge no more than your +cost of physically performing source distribution, a complete +machine-readable copy of the corresponding source code, to be +distributed under the terms of Sections 1 and 2 above on a medium +customarily used for software interchange; or, + +@item +Accompany it with the information you received as to the offer +to distribute corresponding source code. 
(This alternative is +allowed only for noncommercial distribution and only if you +received the program in object code or executable form with such +an offer, in accord with Subsection b above.) +@end enumerate + +The source code for a work means the preferred form of the work for +making modifications to it. For an executable work, complete source +code means all the source code for all modules it contains, plus any +associated interface definition files, plus the scripts used to +control compilation and installation of the executable. However, as a +special exception, the source code distributed need not include +anything that is normally distributed (in either source or binary +form) with the major components (compiler, kernel, and so on) of the +operating system on which the executable runs, unless that component +itself accompanies the executable. + +If distribution of executable or object code is made by offering +access to copy from a designated place, then offering equivalent +access to copy the source code from the same place counts as +distribution of the source code, even though third parties are not +compelled to copy the source along with the object code. + +@item +You may not copy, modify, sublicense, or distribute the Program +except as expressly provided under this License. Any attempt +otherwise to copy, modify, sublicense or distribute the Program is +void, and will automatically terminate your rights under this License. +However, parties who have received copies, or rights, from you under +this License will not have their licenses terminated so long as such +parties remain in full compliance. + +@item +You are not required to accept this License, since you have not +signed it. However, nothing else grants you permission to modify or +distribute the Program or its derivative works. These actions are +prohibited by law if you do not accept this License. 
Therefore, by +modifying or distributing the Program (or any work based on the +Program), you indicate your acceptance of this License to do so, and +all its terms and conditions for copying, distributing or modifying +the Program or works based on it. + +@item +Each time you redistribute the Program (or any work based on the +Program), the recipient automatically receives a license from the +original licensor to copy, distribute or modify the Program subject to +these terms and conditions. You may not impose any further +restrictions on the recipients' exercise of the rights granted herein. +You are not responsible for enforcing compliance by third parties to +this License. + +@item +If, as a consequence of a court judgment or allegation of patent +infringement or for any other reason (not limited to patent issues), +conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot +distribute so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you +may not distribute the Program at all. For example, if a patent +license would not permit royalty-free redistribution of the Program by +all those who receive copies directly or indirectly through you, then +the only way you could satisfy both it and this License would be to +refrain entirely from distribution of the Program. + +If any portion of this section is held invalid or unenforceable under +any particular circumstance, the balance of the section is intended to +apply and the section as a whole is intended to apply in other +circumstances. 
+ +It is not the purpose of this section to induce you to infringe any +patents or other property right claims or to contest validity of any +such claims; this section has the sole purpose of protecting the +integrity of the free software distribution system, which is +implemented by public license practices. Many people have made +generous contributions to the wide range of software distributed +through that system in reliance on consistent application of that +system; it is up to the author/donor to decide if he or she is willing +to distribute software through any other system and a licensee cannot +impose that choice. + +This section is intended to make thoroughly clear what is believed to +be a consequence of the rest of this License. + +@item +If the distribution and/or use of the Program is restricted in +certain countries either by patents or by copyrighted interfaces, the +original copyright holder who places the Program under this License +may add an explicit geographical distribution limitation excluding +those countries, so that distribution is permitted only in or among +countries not thus excluded. In such case, this License incorporates +the limitation as if written in the body of this License. + +@item +The Free Software Foundation may publish revised and/or new versions +of the General Public License from time to time. Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + +Each version is given a distinguishing version number. If the Program +specifies a version number of this License which applies to it and ``any +later version'', you have the option of following the terms and conditions +either of that version or of any later version published by the Free +Software Foundation. If the Program does not specify a version number of +this License, you may choose any version ever published by the Free Software +Foundation. 
+ +@item +If you wish to incorporate parts of the Program into other free +programs whose distribution conditions are different, write to the author +to ask for permission. For software which is copyrighted by the Free +Software Foundation, write to the Free Software Foundation; we sometimes +make exceptions for this. Our decision will be guided by the two goals +of preserving the free status of all derivatives of our free software and +of promoting the sharing and reuse of software generally. + +@iftex +@heading NO WARRANTY +@end iftex +@ifinfo +@center NO WARRANTY +@end ifinfo + +@item +BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY +FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN +OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES +PROVIDE THE PROGRAM ``AS IS'' WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED +OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS +TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE +PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, +REPAIR OR CORRECTION. + +@item +IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR +REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, +INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING +OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED +TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY +YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER +PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE +POSSIBILITY OF SUCH DAMAGES. 
+@end enumerate + +@iftex +@heading END OF TERMS AND CONDITIONS +@end iftex +@ifinfo +@center END OF TERMS AND CONDITIONS +@end ifinfo + +@page +@unnumberedsec Appendix: How to Apply These Terms to Your New Programs + + If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these terms. + + To do so, attach the following notices to the program. It is safest +to attach them to the start of each source file to most effectively +convey the exclusion of warranty; and each file should have at least +the ``copyright'' line and a pointer to where the full notice is found. + +@smallexample +@var{one line to give the program's name and a brief idea of what it does.} +Copyright (C) 19@var{yy} @var{name of author} + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. +@end smallexample + +Also add information on how to contact you by electronic and paper mail. + +If the program is interactive, make it output a short notice like this +when it starts in an interactive mode: + +@smallexample +Gnomovision version 69, Copyright (C) 19@var{yy} @var{name of author} +Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. 
+This is free software, and you are welcome to redistribute it +under certain conditions; type `show c' for details. +@end smallexample + +The hypothetical commands @samp{show w} and @samp{show c} should show +the appropriate parts of the General Public License. Of course, the +commands you use may be called something other than @samp{show w} and +@samp{show c}; they could even be mouse-clicks or menu items---whatever +suits your program. + +You should also get your employer (if you work as a programmer) or your +school, if any, to sign a ``copyright disclaimer'' for the program, if +necessary. Here is a sample; alter the names: + +@example +Yoyodyne, Inc., hereby disclaims all copyright interest in the program +`Gnomovision' (which makes passes at compilers) written by James Hacker. + +@var{signature of Ty Coon}, 1 April 1989 +Ty Coon, President of Vice +@end example + +This General Public License does not permit incorporating your program into +proprietary programs. If your program is a subroutine library, you may +consider it more useful to permit linking proprietary applications with the +library. If this is what you want to do, use the GNU Library General +Public License instead of this License. + + +@node Index, , Copying, Top +@unnumbered Index + +@printindex cp + +@contents + +@bye diff --git a/regex-0.12/doc/texinfo.tex b/regex-0.12/doc/texinfo.tex @@ -0,0 +1,3941 @@ +%% TeX macros to handle texinfo files + +% Copyright (C) 1985, 86, 88, 90, 91, 92, 1993 Free Software Foundation, Inc. + +%This texinfo.tex file is free software; you can redistribute it and/or +%modify it under the terms of the GNU General Public License as +%published by the Free Software Foundation; either version 2, or (at +%your option) any later version. + +%This texinfo.tex file is distributed in the hope that it will be +%useful, but WITHOUT ANY WARRANTY; without even the implied warranty +%of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU +%General Public License for more details. + +%You should have received a copy of the GNU General Public License +%along with this texinfo.tex file; see the file COPYING. If not, write +%to the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, +%USA. + + +%In other words, you are welcome to use, share and improve this program. +%You are forbidden to forbid anyone else to use, share and improve +%what you give them. Help stamp out software-hoarding! + +% Version string of this macro file; it is shown by the \message below and +% by the \everyjob banner. +\def\texinfoversion{2.104} +\message{Loading texinfo package [Version \texinfoversion]:} +\message{} + +% Print the version number if in a .fmt file. +\everyjob{\message{[Texinfo version \texinfoversion]}\message{}} + +% Save some parts of plain tex whose names we will redefine. + +\let\ptexlbrace=\{ +\let\ptexrbrace=\} +\let\ptexdots=\dots +\let\ptexdot=\. +\let\ptexstar=\* +\let\ptexend=\end +\let\ptexbullet=\bullet +\let\ptexb=\b +\let\ptexc=\c +\let\ptexi=\i +\let\ptext=\t +\let\ptexl=\l +\let\ptexL=\L + +\def\tie{\penalty 10000\ } % Save plain tex definition of ~. + +\message{Basics,} +% \other is a numeric constant with value 12, for use as a category code. +\chardef\other=12 + +% If this character appears in an error message or help string, it +% starts a new line in the output. +\newlinechar = `^^J + +% Ignore a token. +% +\def\gobble#1{} + +\hyphenation{ap-pen-dix} +\hyphenation{mini-buf-fer mini-buf-fers} +\hyphenation{eshell} + +% Margin to add to right of even pages, to left of odd pages. +\newdimen \bindingoffset \bindingoffset=0pt +\newdimen \normaloffset \normaloffset=\hoffset +\newdimen\pagewidth \newdimen\pageheight +\pagewidth=\hsize \pageheight=\vsize + +% Sometimes it is convenient to have everything in the transcript file +% and nothing on the terminal. We don't just call \tracingall here, +% since that produces some useless output on the terminal.
+% +\def\gloggingall{\begingroup \globaldefs = 1 \loggingall \endgroup}% +\def\loggingall{\tracingcommands2 \tracingstats2 + \tracingpages1 \tracingoutput1 \tracinglostchars1 + \tracingmacros2 \tracingparagraphs1 \tracingrestores1 + \showboxbreadth\maxdimen\showboxdepth\maxdimen +}% + +%---------------------Begin change----------------------- +% +%%%% For @cropmarks command. +% Dimensions to add cropmarks at corners Added by P. A. MacKay, 12 Nov. 1986 +% +\newdimen\cornerlong \newdimen\cornerthick +\newdimen \topandbottommargin +\newdimen \outerhsize \newdimen \outervsize +\cornerlong=1pc\cornerthick=.3pt % These set size of cropmarks +\outerhsize=7in +%\outervsize=9.5in +% Alternative @smallbook page size is 9.25in +\outervsize=9.25in +\topandbottommargin=.75in +% +%---------------------End change----------------------- + +% \onepageout takes a vbox as an argument. Note that \pagecontents +% does insertions itself, but you have to call it yourself. +\chardef\PAGE=255 \output={\onepageout{\pagecontents\PAGE}} +\def\onepageout#1{\hoffset=\normaloffset +\ifodd\pageno \advance\hoffset by \bindingoffset +\else \advance\hoffset by -\bindingoffset\fi +{\escapechar=`\\\relax % makes sure backslash is used in output files. +\shipout\vbox{{\let\hsize=\pagewidth \makeheadline} \pagebody{#1}% +{\let\hsize=\pagewidth \makefootline}}}% +\advancepageno \ifnum\outputpenalty>-20000 \else\dosupereject\fi} + +%%%% For @cropmarks command %%%% + +% Here is a modification of the main output routine for Near East Publications +% This provides right-angle cropmarks at all four corners. +% The contents of the page are centerlined into the cropmarks, +% and any desired binding offset is added as an \hskip on either +% side of the centerlined box. (P. A. MacKay, 12 November, 1986) +% +\def\croppageout#1{\hoffset=0pt % make sure this doesn't mess things up +{\escapechar=`\\\relax % makes sure backslash is used in output files.
+ \shipout + \vbox to \outervsize{\hsize=\outerhsize + \vbox{\line{\ewtop\hfill\ewtop}} + \nointerlineskip + \line{\vbox{\moveleft\cornerthick\nstop} + \hfill + \vbox{\moveright\cornerthick\nstop}} + \vskip \topandbottommargin + \centerline{\ifodd\pageno\hskip\bindingoffset\fi + \vbox{ + {\let\hsize=\pagewidth \makeheadline} + \pagebody{#1} + {\let\hsize=\pagewidth \makefootline}} + \ifodd\pageno\else\hskip\bindingoffset\fi} + \vskip \topandbottommargin plus1fill minus1fill + \boxmaxdepth\cornerthick + \line{\vbox{\moveleft\cornerthick\nsbot} + \hfill + \vbox{\moveright\cornerthick\nsbot}} + \nointerlineskip + \vbox{\line{\ewbot\hfill\ewbot}} + }} + \advancepageno + \ifnum\outputpenalty>-20000 \else\dosupereject\fi} +% +% Do @cropmarks to get crop marks +\def\cropmarks{\let\onepageout=\croppageout } + +% \pagebody puts #1 in a vbox of height \pageheight, with \boxmaxdepth set +% to \maxdepth. +\def\pagebody#1{\vbox to\pageheight{\boxmaxdepth=\maxdepth #1}} +% \pagecontents unboxes \topins if it is not void, then box #1, then (if +% \footins is not void) adds \skip\footins glue, \footnoterule and the +% unboxed \footins. If \ifr@ggedbottom is true it kerns back by the depth +% of #1 and adds \vfil. The enclosing group sets the catcode of @ to 11 so +% that \dimen@ and \ifr@ggedbottom can be written here. +{\catcode`\@ =11 +\gdef\pagecontents#1{\ifvoid\topins\else\unvbox\topins\fi +\dimen@=\dp#1 \unvbox#1 +\ifvoid\footins\else\vskip\skip\footins\footnoterule \unvbox\footins\fi +\ifr@ggedbottom \kern-\dimen@ \vfil \fi} +} + +% +% Here are the rules for the cropmarks. Note that they are +% offset so that the space between them is truly \outerhsize or \outervsize +% (P. A. MacKay, 12 November, 1986) +% +\def\ewtop{\vrule height\cornerthick depth0pt width\cornerlong} +\def\nstop{\vbox + {\hrule height\cornerthick depth\cornerlong width\cornerthick}} +\def\ewbot{\vrule height0pt depth\cornerthick width\cornerlong} +\def\nsbot{\vbox + {\hrule height\cornerlong depth\cornerthick width\cornerthick}} + +% Parse an argument, then pass it to #1. The argument is the rest of +% the input line (except we remove a trailing comment). #1 should be a +% macro which expects an ordinary undelimited TeX argument. +% +\def\parsearg#1{% + \let\next = #1% + \begingroup + \obeylines + \futurelet\temp\parseargx +} + +% If the next token is an obeyed space (from an @example environment or +% the like), remove it and recurse.
Otherwise, we're done. +\def\parseargx{% + % \obeyedspace is defined far below, after the definition of \sepspaces. + \ifx\obeyedspace\temp + \expandafter\parseargdiscardspace + \else + \expandafter\parseargline + \fi +} + +% Remove a single space (as the delimiter token to the macro call). +{\obeyspaces % + \gdef\parseargdiscardspace {\futurelet\temp\parseargx}} + +{\obeylines % + \gdef\parseargline#1^^M{% + \endgroup % End of the group started in \parsearg. + % + % First remove any @c comment, then any @comment. + % Result of each macro is put in \toks0. + \argremovec #1\c\relax % + \expandafter\argremovecomment \the\toks0 \comment\relax % + % + % Call the caller's macro, saved as \next in \parsearg. + \expandafter\next\expandafter{\the\toks0}% + }% +} + +% Since all \c{,omment} does is throw away the argument, we can let TeX +% do that for us. The \relax here is matched by the \relax in the call +% in \parseargline; it could be more or less anything, its purpose is +% just to delimit the argument to the \c. +\def\argremovec#1\c#2\relax{\toks0 = {#1}} +\def\argremovecomment#1\comment#2\relax{\toks0 = {#1}} + +% \argremovec{,omment} might leave us with trailing spaces, though; e.g., +% @end itemize @c foo +% will have two active spaces as part of the argument with the +% `itemize'. Here we remove all active spaces from #1, and assign the +% result to \toks0. +% +% This loses if there are any *other* active characters besides spaces +% in the argument -- _ ^ +, for example -- since they get expanded. +% Fortunately, Texinfo does not define any such commands. (If it ever +% does, the catcode of the characters in question will have to be changed +% here.) But this means we cannot call \removeactivespaces as part of +% \argremovec{,omment}, since @c uses \parsearg, and thus the argument +% that \parsearg gets might well have any character at all in it.
+% +\def\removeactivespaces#1{% + \begingroup + \ignoreactivespaces + \edef\temp{#1}% + \global\toks0 = \expandafter{\temp}% + \endgroup +} + +% Change the active space to expand to nothing. +% +\begingroup + \obeyspaces + \gdef\ignoreactivespaces{\obeyspaces\let =\empty} +\endgroup + + +% \flushcr discards the token or group that follows it when \par is +% currently \lisppar; otherwise it expands to \relax. +\def\flushcr{\ifx\par\lisppar \def\next##1{}\else \let\next=\relax \fi \next} + +%% These are used to keep @begin/@end levels from running away +%% Call \inENV within environments (after a \begingroup) +\newif\ifENV \ENVfalse \def\inENV{\ifENV\relax\else\ENVtrue\fi} +\def\ENVcheck{% +\ifENV\errmessage{Still within an environment. Type Return to continue.} +\endgroup\fi} % This is not perfect, but it should reduce lossage + +% @begin foo is the same as @foo, for now. +\newhelp\EMsimple{Type <Return> to continue.} + +\outer\def\begin{\parsearg\beginxxx} + +\def\beginxxx #1{% +\expandafter\ifx\csname #1\endcsname\relax +{\errhelp=\EMsimple \errmessage{Undefined command @begin #1}}\else +\csname #1\endcsname\fi} + +% @end foo executes the definition of \Efoo. +% +\def\end{\parsearg\endxxx} +\def\endxxx #1{% + \removeactivespaces{#1}% + \edef\endthing{\the\toks0}% + % + \expandafter\ifx\csname E\endthing\endcsname\relax + \expandafter\ifx\csname \endthing\endcsname\relax + % There's no \foo, i.e., no ``environment'' foo. + \errhelp = \EMsimple + \errmessage{Undefined command `@end \endthing'}% + \else + \unmatchedenderror\endthing + \fi + \else + % Everything's ok; the right environment has been started. + \csname E\endthing\endcsname + \fi +} + +% There is an environment #1, but it hasn't been started. Give an error. +% +\def\unmatchedenderror#1{% + \errhelp = \EMsimple + \errmessage{This `@end #1' doesn't have a matching `@#1'}% +} + +% Define the control sequence \E#1 to give an unmatched @end error. +% +\def\defineunmatchedend#1{% + \expandafter\def\csname E#1\endcsname{\unmatchedenderror{#1}}% +} + + +% Single-spacing is done by various environments.
+ +\newskip\singlespaceskip \singlespaceskip = \baselineskip +% \singlespace adds a kern equal to the current \baselineskip minus +% \singlespaceskip, then sets \baselineskip to \singlespaceskip. +\def\singlespace{% +{\advance \baselineskip by -\singlespaceskip +\kern \baselineskip}% +\baselineskip=\singlespaceskip +} + +%% Simple single-character @ commands + +% @@ prints an @ +% Kludge this until the fonts are right (grr). +\def\@{{\tt \char '100}} + +% This is turned off because it was never documented +% and you can use @w{...} around a quote to suppress ligatures. +%% Define @` and @' to be the same as ` and ' +%% but suppressing ligatures. +%\def\`{{`}} +%\def\'{{'}} + +% Used to generate quoted braces. + +\def\mylbrace {{\tt \char '173}} +\def\myrbrace {{\tt \char '175}} +\let\{=\mylbrace +\let\}=\myrbrace + +% @: forces normal size whitespace following. +\def\:{\spacefactor=1000 } + +% @* forces a line break. +\def\*{\hfil\break\hbox{}\ignorespaces} + +% @. is an end-of-sentence period. +\def\.{.\spacefactor=3000 } + +% @w prevents a word break. Without the \leavevmode, @w at the +% beginning of a paragraph, when TeX is still in vertical mode, would +% produce a whole line of output instead of starting the paragraph. +\def\w#1{\leavevmode\hbox{#1}} + +% @group ... @end group forces ... to be all on one page, by enclosing +% it in a TeX vbox. We use \vtop instead of \vbox to construct the box +% to keep its height that of a normal line. According to the rules for +% \topskip (p.114 of the TeXbook), the glue inserted is +% max (\topskip - \ht (first item), 0). If that height is large, +% therefore, no glue is inserted, and the space between the headline and +% the text is small, which looks bad. +% +\def\group{\begingroup + \ifnum\catcode13=\active \else + \errhelp = \groupinvalidhelp + \errmessage{@group invalid in context where filling is enabled}% + \fi + % + % The \vtop we start below produces a box with normal height and large + % depth; thus, TeX puts \baselineskip glue before it, and (when the + % next line of text is done) \lineskip glue after it. (See p.82 of + % the TeXbook.)
But the next line of text also gets us \parskip glue. + % Final result: space below is slightly more than space above. + \def\Egroup{% + \egroup % End the \vtop. + \endgroup % End the \group. + }% + % + \vtop\bgroup + % We have to put a strut on the last line in case the @group is in + % the midst of an example, rather than completely enclosing it. + % Otherwise, the interline space between the last line of the group + % and the first line afterwards is too small. But we can't put the + % strut in \Egroup, since there it would be on a line by itself. + % Hence this just inserts a strut at the beginning of each line. + \everypar = {\strut}% + % + % We do @comment here in case we are called inside an environment, + % such as @example, where each end-of-line in the input causes an + % end-of-line in the output. We don't want the end-of-line after + % the `@group' to put extra space in the output. Since @group + % should appear on a line by itself (according to the Texinfo + % manual), we don't worry about eating any user text. + \comment +} +% +% TeX puts in an \escapechar (i.e., `@') at the beginning of the help +% message, so this ends up printing `@group can only ...'. +% +\newhelp\groupinvalidhelp{% +group can only be used in environments such as @example,^^J% +where each line of input produces a line of output.} + +% @need space-in-mils +% forces a page break if there is not space-in-mils remaining. + +\newdimen\mil \mil=0.001in + +\def\need{\parsearg\needx} + +% Old definition--didn't work. +%\def\needx #1{\par % +%% This method tries to make TeX break the page naturally +%% if the depth of the box does not fit. +%{\baselineskip=0pt% +%\vtop to #1\mil{\vfil}\kern -#1\mil\penalty 10000 +%\prevdepth=-1000pt +%}} + +\def\needx#1{% + % Go into vertical mode, so we don't make a big box in the middle of a + % paragraph. + \par + % + % Don't add any leading before our big empty box, but allow a page + % break, since the best break might be right here.
+ \allowbreak + \nointerlineskip + \vtop to #1\mil{\vfil}% + % + % TeX does not even consider page breaks if a penalty added to the + % main vertical list is 10000 or more. But in order to see if the + % empty box we just added fits on the page, we must make it consider + % page breaks. On the other hand, we don't want to actually break the + % page after the empty box. So we use a penalty of 9999. + % + % There is an extremely small chance that TeX will actually break the + % page at this \penalty, if there are no other feasible breakpoints in + % sight. (If the user is using lots of big @group commands, which + % almost-but-not-quite fill up a page, TeX will have a hard time doing + % good page breaking, for example.) However, I could not construct an + % example where a page broke at this \penalty; if it happens in a real + % document, then we can reconsider our strategy. + \penalty9999 + % + % Back up by the size of the box, whether we did a page break or not. + \kern -#1\mil + % + % Do not allow a page break right after this kern. + \nobreak +} + +% @br forces paragraph break + +\let\br = \par + +% @dots{} outputs an ellipsis (\ldots set in math mode) + +\def\dots{$\ldots$} + +% @page forces the start of a new page + +\def\page{\par\vfill\supereject} + +% @exdent text.... +% outputs text on separate line in roman font, starting at standard page margin + +% This records the amount of indent in the innermost environment. +% That's how much \exdent should take out. +\newskip\exdentamount + +% This defn is used inside fill environments such as @defun. +\def\exdent{\parsearg\exdentyyy} +\def\exdentyyy #1{{\hfil\break\hbox{\kern -\exdentamount{\rm#1}}\hfil\break}} + +% This defn is used inside nofill environments such as @example. +\def\nofillexdent{\parsearg\nofillexdentyyy} +\def\nofillexdentyyy #1{{\advance \leftskip by -\exdentamount +\leftline{\hskip\leftskip{\rm#1}}}} + +%\hbox{{\rm#1}}\hfil\break}} + +% @include file insert text of that file as input.
+ +\def\include{\parsearg\includezzz} +%Use \input\thisfile to avoid blank after \input, which may be an active +%char (in which case the blank would become the \input argument). +%The grouping keeps the value of \thisfile correct even when @include +%is nested. +\def\includezzz #1{\begingroup +\def\thisfile{#1}\input\thisfile +\endgroup} + +\def\thisfile{} + +% @center line outputs that line, centered + +\def\center{\parsearg\centerzzz} +\def\centerzzz #1{{\advance\hsize by -\leftskip +\advance\hsize by -\rightskip +\centerline{#1}}} + +% @sp n outputs n lines of vertical space + +\def\sp{\parsearg\spxxx} +\def\spxxx #1{\par \vskip #1\baselineskip} + +% @comment ...line which is ignored... +% @c is the same as @comment +% @ignore ... @end ignore is another way to write a comment + +\def\comment{\catcode 64=\other \catcode 123=\other \catcode 125=\other% +\par