Регулярные выражения

Краткое содержание лекции

Теоретические задания | Практические задания

Whenever faced with a problem, some people say `Lets use AWK.' Now, they have two problems.

[D. Tilbrook 1988]

Some people, when confronted with a problem, think
“I know, I'll use regular expressions.” Now they have two problems.

[Jamie Zawinski 12.08.1997]

Разделители

Разделителем может быть любой символ, не являющийся буквой, цифрой, обратной косой чертой или каким-либо пробельным символом.

Часто используемыми разделителями являются косые черты (/), знаки решетки (#) и тильды (~).

Не рекомендуется использовать метасимволы в качестве разделителей.

Спецсимволы

[ ] \ / ^ $ . | ? * + ( ) { }

Представление символов

Символьные классы

<?php
// Both examples scan the same subject string.
$subject = '123456 789 a10b11c12';

// \d is any decimal digit: capture two digits that are followed by a third digit.
preg_match_all('/(\d\d)\d/', $subject, $out);
print_r($out);
/*
Array
(
    [0] => Array
        (
            [0] => 123
            [1] => 456
            [2] => 789
        )

    [1] => Array
        (
            [0] => 12
            [1] => 45
            [2] => 78
        )

)
*/

// \D is any non-digit: capture two digits that are followed by a non-digit
// (a space or a letter in this subject).
preg_match_all('/(\d\d)\D/', $subject, $out);
print_r($out);
/*
Array
(
    [0] => Array
        (
            [0] => 56 
            [1] => 89 
            [2] => 10b
            [3] => 11c
        )

    [1] => Array
        (
            [0] => 56
            [1] => 89
            [2] => 10
            [3] => 11
        )

)
*/
?>

Символьные классы POSIX

Метасимволы

<?php
// All four examples scan the same subject string.
$subject = '123456 789 a10b11c12';

// \b is a word boundary: the two digits must end a "word",
// so `10b` and `11c` are no longer matched.
preg_match_all('/(\d\d)\b\D/', $subject, $out);
print_r($out);
/*
Array
(
    [0] => Array
        (
            [0] => 56 
            [1] => 89 
        )

    [1] => Array
        (
            [0] => 56
            [1] => 89
        )

)
*/

// \G anchors every match to the position where the previous match ended,
// so the scan stops at the first character that breaks the chain (the space).
preg_match_all('/\G(\d\d)/', $subject, $out);
print_r($out);
/*
Array
(
    [0] => Array
        (
            [0] => 12
            [1] => 34
            [2] => 56
        )

    [1] => Array
        (
            [0] => 12
            [1] => 34
            [2] => 56
        )

)
*/

// Outside a character class, ^ anchors the match to the start of the subject.
preg_match_all('/^123/', $subject, $out);
print_r($out);
/*
Array
(
    [0] => Array
        (
            [0] => 123
        )
)
*/

// As the first character inside [...], ^ negates the class:
// match every single character that is not 1, 2 or 3.
preg_match_all('/[^123]/', $subject, $out);
print_r($out);
/*
Array
(
    [0] => Array
        (
            [0] => 4
            [1] => 5
            [2] => 6
            [3] =>  
            [4] => 7
            [5] => 8
            [6] => 9
            [7] =>  
            [8] => a
            [9] => 0
            [10] => b
            [11] => c
        )
)
*/
?>

Квантификаторы

Квантификаторы могут стоять за:

<?php
// The same markup is scanned with a greedy and with a lazy quantifier.
$html = '<h1>Header</h1><div>body</div>';

// Greedy .* grabs as much as possible: from the first `<` to the last `>`.
preg_match_all('/<.*>/', $html, $out);
print_r($out);
/*
Array
(
    [0] => Array
        (
            [0] => <h1>Header</h1><div>body</div>
        )
)
*/

// Lazy .*? stops at the nearest `>`, so every tag is matched separately.
preg_match_all('/<.*?>/', $html, $out);
print_r($out);
/*
Array
(
    [0] => Array
        (
            [0] => <h1>
            [1] => </h1>
            [2] => <div>
            [3] => </div>
        )
)
*/
?>
<?php
// *+ is a possessive quantifier: whatever (xa)* has consumed is never given
// back to the rest of the pattern through backtracking.
$subject = 'abxaabxaa';
preg_match_all('/ab(xa)*+a/', $subject, $out);
print_r($out);
/*
Array
(
    [0] => Array
        (
            [0] => abxaa
        )

    [1] => Array
        (
            [0] => xa
        )

)
*/
?>

Подмаски

<?php
// (?:...) groups the alternatives without creating a capture group.
// The U modifier inverts greediness, so {0,1} prefers zero repetitions
// and only the first digit of each number is taken.
$subject = 'Chapter 50  Section 85';
preg_match_all('/(?:Chapter|Section) [1-9][0-9]{0,1}/U', $subject, $out);
print_r($out);
/*
Array
(
    [0] => Array
        (
            [0] => Chapter 5
            [1] => Section 8
        )
)
*/
?>
<?php
// Two ways of capturing the keyword:
//  - separate groups: each alternative gets its own group number (1 and 2);
//  - branch reset (?|...): every alternative restarts numbering, so both share group 1.
$separateGroups = '/(?:(Chapter)|(Section)) [1-9][0-9]{0,1}/U';
$branchReset = '/(?|(Chapter)|(Section)) [1-9][0-9]{0,1}/U';

$twoHeadings = 'Chapter 50  Section 85';
$oneHeading = 'Section 85';

// Separate groups, two headings: the group that did not take part is an empty string.
preg_match_all($separateGroups, $twoHeadings, $out);
print_r($out);
/*
Array
(
    [0] => Array
        (
            [0] => Chapter 5
            [1] => Section 8
        )

    [1] => Array
        (
            [0] => Chapter
            [1] => 
        )

    [2] => Array
        (
            [0] => 
            [1] => Section
        )

)
*/

// Branch reset, two headings: the keyword is always found in group 1.
preg_match_all($branchReset, $twoHeadings, $out);
print_r($out);
/*
Array
(
    [0] => Array
        (
            [0] => Chapter 5
            [1] => Section 8
        )

    [1] => Array
        (
            [0] => Chapter
            [1] => Section
        )
)
*/

// Separate groups, one heading: group 1 is still present, but empty.
preg_match_all($separateGroups, $oneHeading, $out);
print_r($out);
/*
Array
(
    [0] => Array
        (
            [0] => Section 8
        )

    [1] => Array
        (
            [0] => 
        )

    [2] => Array
        (
            [0] => Section
        )

)
*/

// Branch reset, one heading: a single group holds the keyword.
preg_match_all($branchReset, $oneHeading, $out);
print_r($out);
/*
Array
(
    [0] => Array
        (
            [0] => Section 8
        )

    [1] => Array
        (
            [0] => Section
        )
)
*/
?>

Обращение к подмаскам: (?P=name), \k<name>, \k'name', \1, \g1, \g{1}

<?php
// \1 must repeat exactly the text captured by group 1,
// so the mixed pair `та-ту` is not matched.
$syllables = 'та-та ту-ту та-ту';
preg_match_all('/(та|ту)-\1/', $syllables, $out);
print_r($out);
/*
Array
(
    [0] => Array
        (
            [0] => та-та
            [1] => ту-ту
        )

    [1] => Array
        (
            [0] => та
            [1] => ту
        )

)
*/

// No three-digit run in this subject is followed by a space and a repetition
// of its first two digits, so both result arrays stay empty.
$digits = '123456 789 a10b11c12';
preg_match_all('/(\d\d)\d \1/', $digits, $out);
print_r($out);
/*
Array
(
    [0] => Array
        (
        )

    [1] => Array
        (
        )
)
*/
?>

Что необходимо изменить в последнем примере для получения результата?

<?php
// Same pattern as before, but the subject now contains `456 45`:
// group 1 captures `45`, and \1 finds it again after the space.
$subject = '123456 459 a10b11c12';
preg_match_all('/(\d\d)\d \1/', $subject, $out);
print_r($out);
/*
Array
(
    [0] => Array
        (
            [0] => 456 45
        )

    [1] => Array
        (
            [0] => 45
        )

)
*/
?>

Однократные подмаски: (?>\d+)bar

<?php
// None of the four patterns below can match its subject; the examples differ
// only in whether the engine may backtrack before giving up.
// Every call therefore prints the same result:
/*
Array
(
    [0] => Array
        (
        )

)
*/
$digitsThenBar = '123456bar';
$onlyLettersA = 'aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa';

// Ordinary group: \d+ may give digits back one by one while looking for `foo`.
preg_match_all('/\d+foo/', $digitsThenBar, $out);
print_r($out);

// Atomic group (?>...): the digits matched by \d+ are never given back.
preg_match_all('/(?>\d+)foo/', $digitsThenBar, $out);
print_r($out);

// Ordinary .*: may retreat character by character while looking for `abcd`.
preg_match_all('/^.*abcd$/', $onlyLettersA, $out);
print_r($out);

// Atomic .* takes the whole subject at once; a single look-behind then checks
// whether the text ends with `abcd`.
preg_match_all('/^(?>.*)(?<=abcd)/', $onlyLettersA, $out);
print_r($out);
?>

Просмотр вперед / назад

Поиск по условию

Рекурсия

Модификаторы

Модификаторы действуют с момента вхождения и до конца регулярного выражения или противоположного модификатора. Некоторые интерпретаторы могут применить модификатор ко всему выражению, а не с момента его вхождения.

(?i-sm) А(?#тут комментарий)Б

Самостоятельно

Функции для работы в php

preg_quote экранирует символы . \ + * ? [ ^ ] $ ( ) { } = ! < > | : - # в регулярных выражениях.

<?php
// preg_filter() vs preg_replace(): both apply the patterns one after another,
// but preg_filter() returns only the subjects in which something was replaced.
//
// The patterns contain Cyrillic letters, which are multibyte in UTF-8. The `u`
// modifier is required: without it PCRE works on single bytes, the class
// [а-я] matches halves of characters and the results are broken strings.
$subject = ['1', 'а', '2', 'б', '3', 'А', 'Б', '4'];
$pattern = ['/\d/u', '/[а-я]/u', '/[1а]/u'];
$replace = ['А:$0', 'Б:$0', 'В:$0'];

// Only the elements that were changed are returned; original keys are kept,
// so keys 5 and 6 (`А`, `Б`) are missing.
print_r(preg_filter($pattern, $replace, $subject));

/*
Array
(
    [0] => А:В:1
    [1] => Б:В:а
    [2] => А:2
    [3] => Б:б
    [4] => А:3
    [7] => А:4
)
*/

// Every element is returned; the ones without a match come back unchanged.
print_r(preg_replace($pattern, $replace, $subject));

/*
Array
(
    [0] => А:В:1
    [1] => Б:В:а
    [2] => А:2
    [3] => Б:б
    [4] => А:3
    [5] => А
    [6] => Б
    [7] => А:4
)
*/
?>
<?php
// Split on any run of whitespace characters and/or commas.
$keywords = preg_split('/[\s,]+/', 'hypertext language, programming');
print_r($keywords);
/*
Array
(
    [0] => hypertext
    [1] => language
    [2] => programming
)
*/

// An empty pattern matches between every pair of characters, as well as
// before the first and after the last one, hence the empty edge elements.
$str = 'string';
$chars = preg_split('//', $str, -1);
print_r($chars);
/*
Array
(
    [0] => 
    [1] => s
    [2] => t
    [3] => r
    [4] => i
    [5] => n
    [6] => g
    [7] => 
)
*/

// PREG_SPLIT_NO_EMPTY drops the empty pieces.
$chars = preg_split('//', $str, -1, PREG_SPLIT_NO_EMPTY);
print_r($chars);
/*
Array
(
    [0] => s
    [1] => t
    [2] => r
    [3] => i
    [4] => n
    [5] => g
)
*/

// The delimiter is wrapped in a capture group for the next two calls.
$regexp_code = '/( )/U';
$regexp_text = 'abccaxcc fff';

// PREG_SPLIT_DELIM_CAPTURE also returns the text captured by the delimiter group.
$out = preg_split($regexp_code, $regexp_text, -1, PREG_SPLIT_DELIM_CAPTURE);
print_r($out);
/*
Array
(
    [0] => abccaxcc
    [1] =>  
    [2] => fff
)
*/

// Adding PREG_SPLIT_OFFSET_CAPTURE turns every piece into a [text, offset] pair.
$flags = PREG_SPLIT_OFFSET_CAPTURE | PREG_SPLIT_DELIM_CAPTURE;
$out = preg_split($regexp_code, $regexp_text, -1, $flags);
print_r($out);
/*
Array
(
    [0] => Array
        (
            [0] => abccaxcc
            [1] => 0
        )

    [1] => Array
        (
            [0] =>  
            [1] => 8
        )

    [2] => Array
        (
            [0] => fff
            [1] => 9
        )

)
*/
?>
<?php
// One subject for all three calls; what changes is the pattern and
// the way the starting position is applied.
$subject = 'abcdef';
$offset = 3;

// The offset argument only moves the place where the search starts:
// ^ still means the real beginning of the subject, so nothing matches.
$pattern = '/^def/';
preg_match($pattern, $subject, $matches, PREG_OFFSET_CAPTURE, $offset);
print_r($matches);
/*
Array
(
)
*/

// Without the anchor the match is found; the reported position is counted
// from the beginning of the whole subject.
$pattern = '/def/';
preg_match($pattern, $subject, $matches, PREG_OFFSET_CAPTURE, $offset);
print_r($matches);
/*
Array
(
    [0] => Array
        (
            [0] => def
            [1] => 3
        )

)
*/

// Cutting the subject with substr() makes `def` the real beginning,
// so ^ matches and the reported position is 0.
$pattern = '/^def/';
preg_match($pattern, substr($subject, $offset), $matches, PREG_OFFSET_CAPTURE);
print_r($matches);
/*
Array
(
    [0] => Array
        (
            [0] => def
            [1] => 0
        )

)
*/
?>
<?php
// The same pattern and subject are used twice; only the ordering flag differs.
$tagPattern = '|<[^>]+>(.*)</[^>]+>|U';
$html = '<b>пример: </b><div align=left>это тест</div>';

// PREG_PATTERN_ORDER (the default): $out[0] lists the full matches,
// $out[1] lists what group 1 captured.
preg_match_all($tagPattern, $html, $out, PREG_PATTERN_ORDER);
print_r($out);
/*
Array
(
    [0] => Array
        (
            [0] => <b>пример: </b>
            [1] => <div align=left>это тест</div>
        )

    [1] => Array
        (
            [0] => пример: 
            [1] => это тест
        )

)
*/

// PREG_SET_ORDER: every element of $out is one match together with its groups.
preg_match_all($tagPattern, $html, $out, PREG_SET_ORDER);
print_r($out);

/*
Array
(
    [0] => Array
        (
            [0] => <b>пример: </b>
            [1] => пример: 
        )

    [1] => Array
        (
            [0] => <div align=left>это тест</div>
            [1] => это тест
        )

)
*/
?>

Полезные ссылки