Jump to content

readable regex (verbalexpressions)


Recommended Posts

  • Moderators

I saw that SmartCL.RegEx has been moved to System.RegEx. This reminded me of VerbalExpressions, which I came across a while ago.

VerbalExpressions is basically a library which builds regex strings through a chain of readable function calls.

/^(http)(s)?(\:\/\/)(www\.)?([^\ ]*)$/

The above regular expression checks for correctly formatted URLs and, in plain English, would read something like

  • the url must start with either http or https
  • followed by ://
  • and optionally www.
  • and then anything but a space

This can be replaced by a set of function calls :

// Build a matcher for well-formed URLs by chaining VerbalExpressions calls
const tester = VerEx()
    .startOfLine()
    .then('http')
    .maybe('s')
    .then('://')
    .maybe('www.')
    .anythingBut(' ')
    .endOfLine();

// Sample URL to validate
const testMe = 'https://www.google.com';

// The expression is checked through the native RegExp test() function
alert(
    tester.test(testMe)
        ? 'We have a correct URL' // this is the message shown for testMe
        : 'The URL is incorrect'
);

which is much more readable

The library is available for quite a few languages, including JavaScript, so it could be included as a native lib.

However, the code below is based on a C# implementation, which was translated into Delphi and has now been translated for use in SMS. A bit of a roundabout way.

Anyway, here is the code

unit VerbalExpressions;

{*
* ----------------------------------------------------------------------------
* "THE VODKA-WARE LICENSE" (Revision 42):
* <tim@bandenkrieg.hacked.jp> wrote this file. As long as you retain this notice you
* can do whatever you want with this stuff. If we meet some day, and you think
* this stuff is worth it, you can buy me a vodka in return. Tim Schumacher
* ----------------------------------------------------------------------------
https://www.exceptionnotfound.net/use-verbalexpressions-to-create-readable-regexs-in-c/
https://github.com/enko/DelphiVerbalExpressions
updated for use in Smart Pascal by LynkFS
*}

interface

uses
  SmartCL.System, System.RegEx;

type
  // Raised by TVerbalExpression.Range when it receives an odd number of arguments.
  VerbalExpressionException = class(Exception);

  // Fluent builder for regular-expression pattern text. The builder methods
  // return the instance itself so that calls can be chained.
  TVerbalExpression = class
    private
      // Pattern text accumulated by Add
      FstrSource : string;
      // Text kept separately for the start of the pattern (written by StartOfLine and _Or)
      FStrPrefix : string;
      // Text kept separately for the end of the pattern (written by EndOfLine and _Or)
      FstrSuffix : string;
      // Modifier (flag) letters maintained by AddModifier / RemoveModifier; Clear resets it to 'gm'
      FstrModifier : string;
      // Appends raw (unescaped) pattern text to FstrSource
      function Add(astrValue : string) : TVerbalExpression;
      // Getter for the RegEx property; builds a new TW3RegEx on every read
      function getRegEx : TW3RegEx;
    public
      // Character class of the given characters: [value] (value is not escaped)
      function AnyOf(astrValue : string) : TVerbalExpression;
      // Alias of AnyOf
      function Any(astrValue : string) : TVerbalExpression;
      // Character class built from from/to pairs, e.g. ['0','9'] -> [0-9]
      function Range(astrValue : array of string) : TVerbalExpression;
      // Appends an exact repeat count {n}
      function RepeatPrevious(astrValue : integer) : TVerbalExpression;
      // Appends (.*)
      function Anything : TVerbalExpression;
      // Returns astrValue with regex metacharacters backslash-escaped
      function Sanitize(astrValue : string) : string;
      // Appends the escaped value, adding a '+' quantifier in most cases
      function Multiple(astrValue : string) : TVerbalExpression;
      // Sets ('^') or clears the prefix
      function StartOfLine(aboolEnable : boolean = True) : TVerbalExpression;
      // Sets ('$') or clears the suffix
      function EndOfLine(aboolEnable : boolean = True) : TVerbalExpression;
      // Appends the escaped value as a group: (value)
      function _Then(astrValue : string) : TVerbalExpression;
      // Alias of _Then
      function Find(astrValue : string) : TVerbalExpression;
      // Appends the escaped value as an optional group: (value)?
      function Maybe(astrValue : string) : TVerbalExpression;
      // Appends ([^value]*)
      function AnythingBut(astrValue : string) : TVerbalExpression;
      // Appends an alternation separator ')|(' followed by the escaped value
      function _Or(astrValue : string) : TVerbalExpression;
      // Appends (.+)
      function Something : TVerbalExpression;
      // Appends ([^value]+)
      function SomethingBut(astrValue : string) : TVerbalExpression;
      // Appends (\n|(\r\n))
      function LineBreak : TVerbalExpression;
      // Alias of LineBreak
      function br : TVerbalExpression;
      // Appends \t
      function tab : TVerbalExpression;
      // Appends \w+
      function word : TVerbalExpression;

      // Low-level maintenance of the modifier string
      function AddModifier(astrModifier : string) : TVerbalExpression;
      function RemoveModifier(astrModifier : string) : TVerbalExpression;

      // Convenience switches for the 'i', 'g' and 'm' modifiers respectively
      function WithAnyCase(aboolEnable : boolean = true) : TVerbalExpression;
      function StopAtFirst(aboolEnable : boolean = true) : TVerbalExpression;
      function SearchOneLine(aboolEnable : boolean = true) : TVerbalExpression;

      // Returns the pattern text built so far
      function AsString : string;
      // Empties source, prefix and suffix and resets the modifiers to 'gm'
      function Clear : TVerbalExpression;

      // True when the built expression matches astrValue
      function Test(astrValue : string) : boolean;

      // Regular-expression object for the current pattern
      property RegEx : TW3RegEx read getRegEx;

  end;

implementation

{ TVerbalExpression }

// Appends astrValue verbatim (no escaping) to the accumulated pattern text
// and hands back the builder so calls can be chained.
function TVerbalExpression.Add(astrValue: string): TVerbalExpression;
begin
  FstrSource := FstrSource + astrValue;
  Result := Self;
end;

// Adds a modifier (flag) to the modifier string unless it is already present.
//   astrModifier : the flag text to add, e.g. 'i'
//   Returns      : the builder itself, for chaining.
function TVerbalExpression.AddModifier(astrModifier: string): TVerbalExpression;
begin
  // Pos is 1-based and yields 0 (not -1) when the substring is absent;
  // comparing against -1 meant the modifier was never appended.
  if (Pos(astrModifier,FstrModifier) = 0) then
    FstrModifier := FstrModifier + astrModifier;
  Result := self;
end;

// Alias of AnyOf: appends a character class made of astrValue.
function TVerbalExpression.Any(astrValue: string): TVerbalExpression;
begin
  AnyOf(astrValue);
  Result := Self;
end;

// Appends the character class [astrValue]; the value is inserted as given,
// without escaping.
function TVerbalExpression.AnyOf(astrValue: string): TVerbalExpression;
var
  LstrClass : string;
begin
  LstrClass := '[' + astrValue + ']';
  Result := Add(LstrClass);
end;

// Appends the group (.*) to the pattern.
function TVerbalExpression.Anything: TVerbalExpression;
begin
  Add('(.*)');
  Result := Self;
end;

// Returns the complete pattern text: prefix + accumulated source + suffix.
// The prefix and suffix are where StartOfLine / EndOfLine (and _Or) store
// their text, so leaving them out silently dropped the '^' and '$' anchors.
function TVerbalExpression.AsString: string;
begin
  Result := FStrPrefix + FstrSource + FstrSuffix;
end;

// Alias of LineBreak.
function TVerbalExpression.br: TVerbalExpression;
begin
  Result := LineBreak;
end;

// Resets the builder: empties source, prefix and suffix and sets the
// modifier string to 'gm'. Returns the builder itself, for chaining.
function TVerbalExpression.Clear: TVerbalExpression;
begin
  Result := self;
  FstrSource := '';
  FStrPrefix := '';
  FstrSuffix := '';
  FstrModifier := 'gm';
end;

// Getter of the RegEx property: creates a new TW3RegEx from the complete
// pattern text (including prefix and suffix, so the anchors take effect).
// NOTE(review): FstrModifier is still not handed to the regex object; the
// TW3RegEx constructor overloads are not visible here - confirm how flags
// should be passed before wiring them in.
function TVerbalExpression.getRegEx: TW3RegEx;
begin
  Result := TW3RegEx.Create(AsString);
end;

// Appends a group matching either \n or \r\n.
function TVerbalExpression.LineBreak: TVerbalExpression;
begin
  Add('(\n|(\r\n))');
  Result := Self;
end;

// Appends the escaped value; a '+' quantifier is added unless the inspected
// character is already '+' or '*'.
// NOTE(review): the inspected character sits at position Length-1; if Copy
// is 1-based here, that is the character before the last one rather than
// the last one - confirm the intent.
function TVerbalExpression.Multiple(astrValue: string): TVerbalExpression;
var
  LstrProbe : string;
begin
  astrValue := Sanitize(astrValue);
  LstrProbe := Copy(astrValue,Length(astrValue)-1,1);

  if (LstrProbe <> '+') and (LstrProbe <> '*') then
    astrValue := astrValue + '+';

  Result := Add(astrValue);
end;

// Appends a character class built from consecutive from/to pairs,
// e.g. ['a','z','0','9'] becomes [a-z0-9]. The values are not escaped.
// Raises VerbalExpressionException when the number of items is odd.
function TVerbalExpression.Range(astrValue: array of string): TVerbalExpression;
var
  LintIdx : Integer;
  LstrPairs : string;
begin
  if (Length(astrValue) mod 2) = 1 then
    raise VerbalExpressionException.Create('Number of args must be even');

  // Collect "from-to" for every pair; the brackets are added at the end
  LstrPairs := '';
  LintIdx := 0;
  while LintIdx < Length(astrValue) do begin
    LstrPairs := LstrPairs + astrValue[LintIdx] + '-' + astrValue[LintIdx+1];
    LintIdx := LintIdx + 2;
  end;

  Result := Add('[' + LstrPairs + ']');
end;

// Appends the exact repeat count {n}, where n is astrValue.
function TVerbalExpression.RepeatPrevious(astrValue: integer): TVerbalExpression;
begin
  Add('{' + inttostr(astrValue) + '}');
  Result := Self;
end;

// Removes every occurrence of astrModifier from the modifier string.
//   astrModifier : the flag text to remove, e.g. 'i'
//   Returns      : the builder itself, for chaining.
// This mirrors the StringReplace(..., [rfReplaceAll]) call the port
// originally replaced: all occurrences are removed, and nothing is touched
// when the modifier is absent or empty.
function TVerbalExpression.RemoveModifier(
  astrModifier: string): TVerbalExpression;
var
  LstrRemaining : string;
  LintPos : Integer;
begin
  // An empty needle would never make progress, so skip it entirely
  if Length(astrModifier) > 0 then begin
    LstrRemaining := FstrModifier;
    LintPos := Pos(astrModifier, LstrRemaining);
    // Pos yields 0 when nothing is found; only delete on a real hit
    while LintPos > 0 do begin
      Delete(LstrRemaining, LintPos, Length(astrModifier));
      LintPos := Pos(astrModifier, LstrRemaining);
    end;
    FstrModifier := LstrRemaining;
  end;

  Result := Self;
end;

// Returns astrValue with every regex metacharacter prefixed by a backslash.
// Escaped characters: - [ ] / { } ( ) * + ? . \ ^ $ |
// (stand-in for TW3RegEx.Escape, done directly in JavaScript)
function TVerbalExpression.Sanitize(astrValue: string): string;
begin
  var myStr := '';
  asm
    @myStr = (@astrValue).replace(/[-\[\]\/{}()*+?.\\^$|]/g, "\\$&");
  end;

  Result := myStr;
end;

// Switches single-line searching on or off.
//   aboolEnable : True  -> treat the input as one line (multiline flag 'm' removed)
//                 False -> multiline matching (flag 'm' added)
//   Returns     : the builder itself, for chaining.
// The branches were inverted: enabling "one line" used to ADD the multiline
// flag. This follows the upstream VerbalExpressions behaviour.
function TVerbalExpression.SearchOneLine(
  aboolEnable: boolean): TVerbalExpression;
begin
  if aboolEnable then
    Result := RemoveModifier('m')
  else
    Result := AddModifier('m');
end;

// Appends the group (.+) to the pattern.
function TVerbalExpression.Something: TVerbalExpression;
begin
  Add('(.+)');
  Result := Self;
end;

// Appends ([^value]+), where value is astrValue after escaping.
function TVerbalExpression.SomethingBut(astrValue: string): TVerbalExpression;
var
  LstrExcluded : string;
begin
  LstrExcluded := Sanitize(astrValue);
  Result := Add('([^' + LstrExcluded + ']+)');
end;

// Sets the prefix to '^' when enabled, or to an empty string when disabled.
// Any previous prefix content is overwritten. Returns the builder itself.
function TVerbalExpression.StartOfLine(aboolEnable : boolean = True) : TVerbalExpression;
begin
  FStrPrefix := '';
  if aboolEnable then
    FStrPrefix := '^';
  Result := Self;
end;

// Switches "stop at the first match" on or off.
//   aboolEnable : True  -> stop at the first match (global flag 'g' removed)
//                 False -> find all matches (flag 'g' added)
//   Returns     : the builder itself, for chaining.
// The branches were inverted: enabling "stop at first" used to ADD the
// global flag. This follows the upstream VerbalExpressions behaviour.
function TVerbalExpression.StopAtFirst(aboolEnable: boolean = true): TVerbalExpression;
begin
  if aboolEnable then
    Result := RemoveModifier('g')
  else
    Result := AddModifier('g');
end;

// Appends \t to the pattern.
function TVerbalExpression.tab: TVerbalExpression;
begin
  Add('\t');
  Result := Self;
end;

// Returns True when the regular expression built so far matches astrValue.
// A new regex object is obtained from getRegEx for every call.
function TVerbalExpression.Test(astrValue: string): boolean;
var
  LobjRegEx : TW3RegEx;
begin
  LobjRegEx := getRegEx;
  Result := LobjRegEx.Test(astrValue);
end;

// Adds the 'i' modifier when enabled and removes it when disabled.
// Returns the builder itself, for chaining.
function TVerbalExpression.WithAnyCase(aboolEnable: boolean = true): TVerbalExpression;
begin
  if not aboolEnable then
    Result := RemoveModifier('i')
  else
    Result := AddModifier('i');
end;

// Appends \w+ to the pattern.
function TVerbalExpression.word: TVerbalExpression;
begin
  Add('\w+');
  Result := Self;
end;

// Sets the suffix to '$' when enabled, or to an empty string when disabled.
// Any previous suffix content is overwritten. Returns the builder itself.
function TVerbalExpression.EndOfLine(aboolEnable : boolean = True) : TVerbalExpression;
begin
  FstrSuffix := '';
  if aboolEnable then
    FstrSuffix := '$';
  Result := Self;
end;

// Alias of _Then: appends the escaped value as a group.
function TVerbalExpression.Find(astrValue: string): TVerbalExpression;
begin
  _Then(astrValue);
  Result := Self;
end;

// Starts an alternative: everything added so far OR what follows.
//   astrValue : optional first element of the alternative; it is escaped
//               before being appended. Pass '' to add nothing.
//   Returns   : the builder itself, for chaining.
// An opening parenthesis is kept in the prefix and a closing one in the
// suffix so that the ')|(' separator added to the source is balanced.
function TVerbalExpression._Or(astrValue: string): TVerbalExpression;
begin
  // Pos yields 0 (not -1) when the character is absent; the former
  // comparison against -1 meant the parentheses were never added.
  if (Pos('(',FstrPrefix) = 0) then begin
      FStrPrefix := FStrPrefix + '(';
  end;

  // The closing parenthesis goes in front of the suffix so that an
  // existing '$' stays outside the group: ')$' rather than '$)'.
  if (Pos(')',FstrSuffix) = 0) then begin
      FstrSuffix := ')' + FstrSuffix;
  end;

  Add(')|(');

  if Length(astrValue) > 0 then
    Add(Sanitize(astrValue));

  Result := self;
end;

// Appends (value), where value is astrValue after escaping.
function TVerbalExpression._Then(astrValue : string) : TVerbalExpression;
var
  LstrLiteral : string;
begin
  LstrLiteral := Sanitize(astrValue);
  Result := Add('(' + LstrLiteral + ')');
end;

// Appends (value)?, i.e. the escaped astrValue as an optional group.
function TVerbalExpression.Maybe(astrValue : string) : TVerbalExpression;
var
  LstrLiteral : string;
begin
  LstrLiteral := Sanitize(astrValue);
  Result := Add('(' + LstrLiteral + ')?');
end;

// Appends ([^value]*), where value is astrValue after escaping.
function TVerbalExpression.AnythingBut(astrValue : string) : TVerbalExpression;
var
  LstrExcluded : string;
begin
  LstrExcluded := Sanitize(astrValue);
  Result := Add('([^' + LstrExcluded + ']*)');
end;

end.

 

and usage looks like this, with example checks for a URL, an email address and a phone number

 

unit Form1;

interface

uses 
  SmartCL.System, SmartCL.Graphics, SmartCL.Components, SmartCL.Forms, 
  SmartCL.Fonts, SmartCL.Borders, SmartCL.Application, VerbalExpressions;

type
  // Demo form: builds three verbal expressions in InitializeObject and
  // writes the test results with writeln.
  TForm1 = class(TW3Form)
  private
    {$I 'Form1:intf'}
  protected
    procedure InitializeForm; override;
    procedure InitializeObject; override;
    procedure Resize; override;
    // Expression builders created in InitializeObject
    urlExp   : TVerbalExpression;   // simple URL check
    emailExp : TVerbalExpression;   // simple email address check
    phoneExp : TVerbalExpression;   // (australian) phone number check
  end;

implementation

{ TForm1 }

// Form initialization hook; currently only calls the inherited implementation.
procedure TForm1.InitializeForm;
begin
  inherited;
  // this is a good place to initialize components
end;

// Builds the URL, email and phone expressions and writes, for each sample
// value, the pattern text and whether the sample is valid.
procedure TForm1.InitializeObject;
begin
  inherited;
  {$I 'Form1:impl'}

  // --- Simple URLs -------------------------------------------------------
  // - starts with "http", optionally followed by "s"
  // - then "://"
  // - then optionally "www."
  // - then anything that is not a space
  urlExp := TVerbalExpression.Create
    .StartOfLine()
    ._Then('http')
    .Maybe('s')
    ._Then('://')
    .Maybe('www.')
    .AnythingBut(' ')
    .EndOfLine();

  writeln('regexp url : ' + urlExp.AsString);

  var url := "https://github.com";
  var verdict := ' is invalid';
  if urlExp.Test(url) then
    verdict := ' is valid';
  writeln(url + verdict);

  // --- Email addresses ---------------------------------------------------
  // - any text, followed by '@'
  // - then any text without a space, followed by '.'
  // - then any text without a space
  emailExp := TVerbalExpression.Create
    .StartOfLine()
    .Anything()
    ._Then("@")
    .AnythingBut(" ")
    ._Then(".")
    .AnythingBut(" ")
    .EndOfLine();

  writeln('regexp email : ' + emailExp.AsString);

  var email := "test@example.com";
  verdict := ' is invalid';
  if emailExp.Test(email) then
    verdict := ' is valid';
  writeln(email + verdict);

  email := "test@example";
  verdict := ' is invalid';
  if emailExp.Test(email) then
    verdict := ' is valid';
  writeln(email + verdict);

  // --- (Australian) phone numbers ----------------------------------------
  // - an optional "("
  // - 2 digits in the range 0-9
  // - an optional ")"
  // - an optional space
  // - 4 digits in the range 0-9
  // - an optional space (a dash is not accepted by this expression)
  // - 4 digits in the range 0-9
  phoneExp := TVerbalExpression.Create
    .StartOfLine()
    .Maybe("(")
    .Range(['0', '9'])
    .RepeatPrevious(2)
    .Maybe(")")
    .Maybe(" ")
    .Range(['0', '9'])
    .RepeatPrevious(4)
    .Maybe(" ")
    .Range(['0', '9'])
    .RepeatPrevious(4)
    .EndOfLine();

  writeln('regexp au-phone : ' + phoneExp.AsString);

  var phone := "(02) 2093 9118";
  verdict := ' is invalid';
  if phoneExp.Test(phone) then
    verdict := ' is valid';
  writeln(phone + verdict);

  phone := "022093911";
  verdict := ' is invalid';
  if phoneExp.Test(phone) then
    verdict := ' is valid';
  writeln(phone + verdict);

end;
 
// Resize hook; adds no layout of its own and only calls the inherited implementation.
procedure TForm1.Resize;
begin
  inherited;
end;
 
// Unit start-up: registers TForm1 with Forms.RegisterForm.
// NOTE(review): {$I %FILE%} presumably expands to this unit's file name - confirm.
initialization
  Forms.RegisterForm({$I %FILE%}, TForm1);
end.

 

Link to post
Share on other sites

Join the conversation

You can post now and register later. If you have an account, sign in now to post with your account.

Guest
Reply to this topic...

×   Pasted as rich text.   Paste as plain text instead

  Only 75 emoji are allowed.

×   Your link has been automatically embedded.   Display as a link instead

×   Your previous content has been restored.   Clear editor

×   You cannot paste images directly. Upload or insert images from URL.

×
×
  • Create New...