regex.c 23 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188118911901191119211931194119511961197119811991200
  1. /******************************************************************************
  2. * This file is part of TinTin++ *
  3. * *
  4. * Copyright 2004-2020 Igor van den Hoven *
  5. * *
  6. * TinTin++ is free software; you can redistribute it and/or modify *
  7. * it under the terms of the GNU General Public License as published by *
  8. * the Free Software Foundation; either version 3 of the License, or *
  9. * (at your option) any later version. *
  10. * *
  11. * This program is distributed in the hope that it will be useful, *
  12. * but WITHOUT ANY WARRANTY; without even the implied warranty of *
  13. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
  14. * GNU General Public License for more details. *
  15. * *
  16. * You should have received a copy of the GNU General Public License *
  17. * along with TinTin++. If not, see https://www.gnu.org/licenses. *
  18. ******************************************************************************/
  19. /******************************************************************************
  20. * T I N T I N + + *
  21. * *
  22. * coded by Igor van den Hoven 2004 *
  23. ******************************************************************************/
  24. #include <sys/types.h>
  25. #include <pcre.h>
  26. #include "tintin.h"
  27. int match(struct session *ses, char *str, char *exp, int sub)
  28. {
  29. char expbuf[BUFFER_SIZE];
  30. sprintf(expbuf, "\\A%s\\Z", exp);
  31. substitute(ses, expbuf, expbuf, sub);
  32. return tintin_regexp(ses, NULL, str, expbuf, 0, 0);
  33. }
  34. int find(struct session *ses, char *str, char *exp, int sub, int flag)
  35. {
  36. if (HAS_BIT(sub, SUB_VAR|SUB_FUN))
  37. {
  38. char expbuf[BUFFER_SIZE], strbuf[BUFFER_SIZE];
  39. substitute(ses, str, strbuf, SUB_VAR|SUB_FUN);
  40. substitute(ses, exp, expbuf, SUB_VAR|SUB_FUN);
  41. return tintin_regexp(ses, NULL, strbuf, expbuf, 0, flag);
  42. }
  43. else
  44. {
  45. return tintin_regexp(ses, NULL, str, exp, 0, flag);
  46. }
  47. }
  48. DO_COMMAND(do_regexp)
  49. {
  50. char arg1[BUFFER_SIZE], arg2[BUFFER_SIZE], is_t[BUFFER_SIZE], is_f[BUFFER_SIZE];
  51. arg = sub_arg_in_braces(ses, arg, arg1, GET_ONE, SUB_VAR|SUB_FUN);
  52. arg = sub_arg_in_braces(ses, arg, arg2, GET_ONE, SUB_VAR|SUB_FUN);
  53. arg = get_arg_in_braces(ses, arg, is_t, GET_ALL);
  54. arg = get_arg_in_braces(ses, arg, is_f, GET_ALL);
  55. if (*is_t == 0)
  56. {
  57. show_error(ses, LIST_COMMAND, "SYNTAX: #REGEXP {string} {expression} {true} {false}.");
  58. }
  59. else
  60. {
  61. if (tintin_regexp(ses, NULL, arg1, arg2, 0, REGEX_FLAG_CMD))
  62. {
  63. substitute(ses, is_t, is_t, SUB_CMD);
  64. ses = script_driver(ses, LIST_COMMAND, is_t);
  65. }
  66. else if (*is_f)
  67. {
  68. ses = script_driver(ses, LIST_COMMAND, is_f);
  69. }
  70. }
  71. return ses;
  72. }
  73. int regexp_compare(struct session *ses, pcre *nodepcre, char *str, char *exp, int option, int flag)
  74. {
  75. pcre *regex;
  76. int i, j, matches, match[303];
  77. if (nodepcre == NULL)
  78. {
  79. regex = regexp_compile(ses, exp, option);
  80. }
  81. else
  82. {
  83. regex = nodepcre;
  84. }
  85. if (regex == NULL)
  86. {
  87. return FALSE;
  88. }
  89. matches = pcre_exec(regex, NULL, str, strlen(str), 0, 0, match, 303);
  90. if (matches <= 0)
  91. {
  92. if (nodepcre == NULL)
  93. {
  94. free(regex);
  95. }
  96. return FALSE;
  97. }
  98. // REGEX_FLAG_FIX handles %1 to %99 usage. Backward compatibility.
  99. switch (flag)
  100. {
  101. case REGEX_FLAG_CMD:
  102. for (i = 0 ; i < matches ; i++)
  103. {
  104. gtd->cmds[i] = restringf(gtd->cmds[i], "%.*s", match[i*2+1] - match[i*2], &str[match[i*2]]);
  105. }
  106. break;
  107. case REGEX_FLAG_CMD + REGEX_FLAG_FIX:
  108. for (i = 0 ; i < matches ; i++)
  109. {
  110. j = gtd->args[i];
  111. gtd->cmds[j] = restringf(gtd->cmds[j], "%.*s", match[i*2+1] - match[i*2], &str[match[i*2]]);
  112. }
  113. break;
  114. case REGEX_FLAG_ARG:
  115. for (i = 0 ; i < matches ; i++)
  116. {
  117. gtd->vars[i] = restringf(gtd->vars[i], "%.*s", match[i*2+1] - match[i*2], &str[match[i*2]]);
  118. }
  119. break;
  120. case REGEX_FLAG_ARG + REGEX_FLAG_FIX:
  121. for (i = 0 ; i < matches ; i++)
  122. {
  123. j = gtd->args[i];
  124. gtd->vars[j] = restringf(gtd->vars[j], "%.*s", match[i*2+1] - match[i*2], &str[match[i*2]]);
  125. }
  126. break;
  127. }
  128. if (nodepcre == NULL)
  129. {
  130. free(regex);
  131. }
  132. return TRUE;
  133. }
  134. pcre *regexp_compile(struct session *ses, char *exp, int option)
  135. {
  136. const char *error;
  137. int i;
  138. /*
  139. if (HAS_BIT(ses->charset, CHARSET_FLAG_UTF8))
  140. {
  141. option |= PCRE_UTF8|PCRE_NO_UTF8_CHECK;
  142. }
  143. */
  144. return pcre_compile(exp, option, &error, &i, NULL);
  145. }
  146. /******************************************************************************
  147. * Calls tintin_regexp checking if the string matches, and automatically fills *
  148. * in the text represented by the wildcards on success. *
  149. ******************************************************************************/
  150. int check_one_regexp(struct session *ses, struct listnode *node, char *line, char *original, int option)
  151. {
  152. char *exp, *str;
  153. if (node->regex == NULL)
  154. {
  155. char result[BUFFER_SIZE];
  156. substitute(ses, node->arg1, result, SUB_VAR|SUB_FUN);
  157. exp = result;
  158. }
  159. else
  160. {
  161. exp = node->arg1;
  162. }
  163. if (*node->arg1 == '~')
  164. {
  165. exp++;
  166. str = original;
  167. }
  168. else
  169. {
  170. str = line;
  171. }
  172. return tintin_regexp(ses, node->regex, str, exp, option, REGEX_FLAG_ARG);
  173. }
  174. /*
  175. Keep synched with tintin_regexp and tintin_regexp_compile
  176. */
  177. int get_regex_range(char *in, char *out, int *var, int *arg)
  178. {
  179. char *pti, *pto, *ptr, range[BUFFER_SIZE];
  180. pto = out;
  181. pti = in;
  182. ptr = range;
  183. while (*pti)
  184. {
  185. switch (*pti)
  186. {
  187. case '0':
  188. case '1':
  189. case '2':
  190. case '3':
  191. case '4':
  192. case '5':
  193. case '6':
  194. case '7':
  195. case '8':
  196. case '9':
  197. *ptr++ = *pti++;
  198. continue;
  199. case '.':
  200. if (pti[1] != '.')
  201. {
  202. goto end;
  203. }
  204. if (ptr == range)
  205. {
  206. *ptr++ = '0';
  207. }
  208. *ptr++ = ',';
  209. pti += 2;
  210. continue;
  211. case 'a':
  212. pto += sprintf(pto, "(.");
  213. break;
  214. case 'A':
  215. pto += sprintf(pto, "(\\n");
  216. break;
  217. case 'd':
  218. pto += sprintf(pto, "([0-9]");
  219. break;
  220. case 'D':
  221. pto += sprintf(pto, "([^0-9]");
  222. break;
  223. case 'p':
  224. pto += sprintf(pto, "([\\x20-\\xfe]");
  225. break;
  226. case 'P':
  227. pto += sprintf(pto, "([^\\x20-\\xfe]");
  228. break;
  229. case 's':
  230. pto += sprintf(pto, "(\\s");
  231. break;
  232. case 'S':
  233. pto += sprintf(pto, "(\\S");
  234. break;
  235. case 'u':
  236. pto += sprintf(pto, "((?:[\\xC0-\\xFE][\\x80-\\xC0]{1,3})");
  237. break;
  238. case 'U':
  239. pto += sprintf(pto, "([\\x00-\\x7F\\xFF]");
  240. break;
  241. case 'w':
  242. pto += sprintf(pto, "([a-zA-Z]");
  243. break;
  244. case 'W':
  245. pto += sprintf(pto, "([^a-zA-Z]");
  246. break;
  247. default:
  248. goto end;
  249. }
  250. *ptr = 0;
  251. pti++;
  252. pto += sprintf(pto, "{%s}%s", range, *pti ? "?)" : ")");
  253. return pti - in;
  254. }
  255. end:
  256. if (var)
  257. {
  258. gtd->args[next_arg(*var)] = next_arg(*arg);
  259. }
  260. strcpy(out, in[2] == 0 ? "(.+)" : "(.+?)");
  261. return 0;
  262. }
  263. int tintin_regexp_check(struct session *ses, char *exp)
  264. {
  265. if (*exp == '^')
  266. {
  267. return TRUE;
  268. }
  269. while (*exp)
  270. {
  271. if (HAS_BIT(ses->charset, CHARSET_FLAG_EUC) && is_euc_head(ses, exp))
  272. {
  273. exp += 2;
  274. continue;
  275. }
  276. switch (exp[0])
  277. {
  278. case '\\':
  279. case '{':
  280. return TRUE;
  281. case '$':
  282. if (exp[1] == 0)
  283. {
  284. return TRUE;
  285. }
  286. break;
  287. case '%':
  288. switch (exp[1])
  289. {
  290. case '0':
  291. case '1':
  292. case '2':
  293. case '3':
  294. case '4':
  295. case '5':
  296. case '6':
  297. case '7':
  298. case '8':
  299. case '9':
  300. case 'a':
  301. case 'A':
  302. case 'd':
  303. case 'D':
  304. case 'i':
  305. case 'I':
  306. case 'p':
  307. case 'P':
  308. case 's':
  309. case 'S':
  310. case 'u':
  311. case 'U':
  312. case 'w':
  313. case 'W':
  314. case '?':
  315. case '*':
  316. case '+':
  317. case '.':
  318. case '%':
  319. return TRUE;
  320. case '!':
  321. switch (exp[2])
  322. {
  323. case 'a':
  324. case 'A':
  325. case 'd':
  326. case 'D':
  327. case 'p':
  328. case 'P':
  329. case 's':
  330. case 'S':
  331. case 'u':
  332. case 'U':
  333. case 'w':
  334. case 'W':
  335. case '?':
  336. case '*':
  337. case '+':
  338. case '.':
  339. case '{':
  340. return TRUE;
  341. }
  342. break;
  343. }
  344. break;
  345. }
  346. exp++;
  347. }
  348. return FALSE;
  349. }
  350. int tintin_regexp(struct session *ses, pcre *nodepcre, char *str, char *exp, int option, int flag)
  351. {
  352. char out[BUFFER_SIZE], *pti, *pto;
  353. int arg = 1, var = 1, fix = 0;
  354. pti = exp;
  355. pto = out;
  356. while (*pti == '^')
  357. {
  358. *pto++ = *pti++;
  359. }
  360. while (*pti)
  361. {
  362. if (HAS_BIT(ses->charset, CHARSET_FLAG_EUC) && is_euc_head(ses, pti))
  363. {
  364. *pto++ = *pti++;
  365. switch (*pti)
  366. {
  367. case '\\':
  368. case '[':
  369. case ']':
  370. case '(':
  371. case ')':
  372. case '|':
  373. case '.':
  374. case '?':
  375. case '+':
  376. case '*':
  377. case '$':
  378. case '^':
  379. *pto++ = '\\';
  380. break;
  381. }
  382. *pto++ = *pti++;
  383. continue;
  384. }
  385. switch (pti[0])
  386. {
  387. case '\\':
  388. *pto++ = *pti++;
  389. *pto++ = *pti++;
  390. break;
  391. case '{':
  392. gtd->args[next_arg(var)] = next_arg(arg);
  393. *pto++ = '(';
  394. pti = get_arg_in_braces(ses, pti, pto, GET_ALL);
  395. pto += strlen(pto);
  396. *pto++ = ')';
  397. break;
  398. case '[':
  399. case ']':
  400. case '(':
  401. case ')':
  402. case '|':
  403. case '.':
  404. case '?':
  405. case '+':
  406. case '*':
  407. case '^':
  408. *pto++ = '\\';
  409. *pto++ = *pti++;
  410. break;
  411. case '$':
  412. if (pti[1] != DEFAULT_OPEN && !isalnum((int) pti[1]))
  413. {
  414. int i = 0;
  415. while (pti[++i] == '$')
  416. {
  417. continue;
  418. }
  419. if (pti[i])
  420. {
  421. *pto++ = '\\';
  422. }
  423. }
  424. *pto++ = *pti++;
  425. break;
  426. case '%':
  427. switch (pti[1])
  428. {
  429. case '0':
  430. case '1':
  431. case '2':
  432. case '3':
  433. case '4':
  434. case '5':
  435. case '6':
  436. case '7':
  437. case '8':
  438. case '9':
  439. fix = REGEX_FLAG_FIX;
  440. arg = isdigit((int) pti[2]) ? (pti[1] - '0') * 10 + (pti[2] - '0') : pti[1] - '0';
  441. gtd->args[next_arg(var)] = next_arg(arg);
  442. pti += isdigit((int) pti[2]) ? 3 : 2;
  443. strcpy(pto, *pti == 0 ? "(.*)" : "(.*?)");
  444. pto += strlen(pto);
  445. break;
  446. case 'a':
  447. gtd->args[next_arg(var)] = next_arg(arg);
  448. pti += 2;
  449. strcpy(pto, *pti == 0 ? "([^\\n]*)" : "([^\\n]*?)");
  450. pto += strlen(pto);
  451. break;
  452. case 'A':
  453. gtd->args[next_arg(var)] = next_arg(arg);
  454. pti += 2;
  455. strcpy(pto, *pti == 0 ? "(\\n*)" : "(\\n*?)");
  456. pto += strlen(pto);
  457. break;
  458. case 'd':
  459. gtd->args[next_arg(var)] = next_arg(arg);
  460. pti += 2;
  461. strcpy(pto, *pti == 0 ? "([0-9]*)" : "([0-9]*?)");
  462. pto += strlen(pto);
  463. break;
  464. case 'D':
  465. gtd->args[next_arg(var)] = next_arg(arg);
  466. pti += 2;
  467. strcpy(pto, *pti == 0 ? "([^0-9]*)" : "([^0-9]*?)");
  468. pto += strlen(pto);
  469. break;
  470. case 'i':
  471. pti += 2;
  472. strcpy(pto, "(?i)");
  473. pto += strlen(pto);
  474. break;
  475. case 'I':
  476. pti += 2;
  477. strcpy(pto, "(?-i)");
  478. pto += strlen(pto);
  479. break;
  480. case 'p':
  481. gtd->args[next_arg(var)] = next_arg(arg);
  482. pti += 2;
  483. pto += sprintf(pto, "%s", *pti == 0 ? "([\\x20-\\xfe]*)" : "([\\x20-\\xfe]*?)");
  484. break;
  485. case 'P':
  486. gtd->args[next_arg(var)] = next_arg(arg);
  487. pti += 2;
  488. pto += sprintf(pto, "%s", *pti == 0 ? "([^\\x20-\\xfe]*)" : "([^\\x20-\\xfe]*?)");
  489. break;
  490. case 's':
  491. gtd->args[next_arg(var)] = next_arg(arg);
  492. pti += 2;
  493. strcpy(pto, *pti == 0 ? "(\\s*)" : "(\\s*?)");
  494. pto += strlen(pto);
  495. break;
  496. case 'S':
  497. gtd->args[next_arg(var)] = next_arg(arg);
  498. pti += 2;
  499. strcpy(pto, *pti == 0 ? "(\\S*)" : "(\\S*?)");
  500. pto += strlen(pto);
  501. break;
  502. case 'u':
  503. gtd->args[next_arg(var)] = next_arg(arg);
  504. pti += 2;
  505. strcpy(pto, *pti == 0 ? "((?:[\\x00-\\x7F|\\xC0-\\xFE][\\x80-\\xC0]{1,3})*)" : "((?:[\\xC0-\\xFE][\\x80-\\xC0]{1,3})*?)");
  506. pto += strlen(pto);
  507. break;
  508. case 'U':
  509. gtd->args[next_arg(var)] = next_arg(arg);
  510. pti += 2;
  511. strcpy(pto, *pti == 0 ? "(^[\xFF]*)" : "([\\x00-\\x7F\\xFF]*?)");
  512. pto += strlen(pto);
  513. break;
  514. case 'w':
  515. gtd->args[next_arg(var)] = next_arg(arg);
  516. pti += 2;
  517. strcpy(pto, *pti == 0 ? "([a-zA-Z]*)" : "([a-zA-Z]*?)");
  518. pto += strlen(pto);
  519. break;
  520. case 'W':
  521. gtd->args[next_arg(var)] = next_arg(arg);
  522. pti += 2;
  523. strcpy(pto, *pti == 0 ? "([^a-zA-Z]*)" : "([^a-zA-Z]*?)");
  524. pto += strlen(pto);
  525. break;
  526. case '*':
  527. gtd->args[next_arg(var)] = next_arg(arg);
  528. pti += 2;
  529. strcpy(pto, *pti == 0 ? "(.*)" : "(.*?)");
  530. pto += strlen(pto);
  531. break;
  532. case '+':
  533. pti += 2 + get_regex_range(&pti[2], pto, &var, &arg);
  534. pto += strlen(pto);
  535. break;
  536. case '%':
  537. *pto++ = *pti++;
  538. pti++;
  539. break;
  540. case '.':
  541. gtd->args[next_arg(var)] = next_arg(arg);
  542. pti += 2;
  543. strcpy(pto, "(.)");
  544. pto += strlen(pto);
  545. break;
  546. case '?':
  547. gtd->args[next_arg(var)] = next_arg(arg);
  548. pti += 2;
  549. strcpy(pto, *pti == 0 ? "(.?)" : "(.?" "?)");
  550. pto += strlen(pto);
  551. break;
  552. case '!':
  553. switch (pti[2])
  554. {
  555. case 'a':
  556. gtd->args[next_arg(var)] = next_arg(arg);
  557. pti += 2;
  558. strcpy(pto, *pti == 0 ? "[^\\n]*" : "[^\\n]*?");
  559. pto += strlen(pto);
  560. break;
  561. case 'A':
  562. gtd->args[next_arg(var)] = next_arg(arg);
  563. pti += 2;
  564. strcpy(pto, *pti == 0 ? "\\n*" : "\\n*?");
  565. pto += strlen(pto);
  566. break;
  567. case 'd':
  568. pti += 3;
  569. strcpy(pto, *pti == 0 ? "[0-9]*" : "[0-9]*?");
  570. pto += strlen(pto);
  571. break;
  572. case 'D':
  573. pti += 3;
  574. strcpy(pto, *pti == 0 ? "[^0-9]*" : "[^0-9]*?");
  575. pto += strlen(pto);
  576. break;
  577. case 'p':
  578. pti += 3;
  579. pto += sprintf(pto, "%s", *pti == 0 ? "[\\x20-\\xfe]*" : "[\\x20-\\xfe]*?");
  580. break;
  581. case 'P':
  582. pti += 3;
  583. pto += sprintf(pto, "%s", *pti == 0 ? "[^\\x20-\\xfe]*" : "[^\\x20-\\xfe]*?");
  584. break;
  585. case 's':
  586. pti += 3;
  587. strcpy(pto, *pti == 0 ? "\\s*" : "\\s*?");
  588. pto += strlen(pto);
  589. break;
  590. case 'S':
  591. pti += 3;
  592. strcpy(pto, *pti == 0 ? "\\S*" : "\\S*?");
  593. pto += strlen(pto);
  594. break;
  595. case 'u':
  596. gtd->args[next_arg(var)] = next_arg(arg);
  597. pti += 3;
  598. strcpy(pto, *pti == 0 ? "(?:[\\xC0-\\xFE][\\x80-\\xC0]{1,3})*" : "(?:[\\xC0-\\xFE][\\x80-\\xC0]{1,3})*?");
  599. pto += strlen(pto);
  600. break;
  601. case 'U':
  602. gtd->args[next_arg(var)] = next_arg(arg);
  603. pti += 3;
  604. strcpy(pto, *pti == 0 ? "[\\x00-\\x7F\\xFF]*" : "[\\x00-\\x7F\\xFF]*?");
  605. pto += strlen(pto);
  606. break;
  607. case 'w':
  608. pti += 3;
  609. strcpy(pto, *pti == 0 ? "[a-zA-Z]*" : "[a-zA-Z]*?");
  610. pto += strlen(pto);
  611. break;
  612. case 'W':
  613. pti += 3;
  614. strcpy(pto, *pti == 0 ? "[^a-zA-Z]*" : "[^a-zA-Z]*?");
  615. pto += strlen(pto);
  616. break;
  617. case '?':
  618. pti += 3;
  619. strcpy(pto, *pti == 0 ? ".?" : ".?" "?");
  620. pto += strlen(pto);
  621. break;
  622. case '*':
  623. pti += 3;
  624. strcpy(pto, *pti == 0 ? ".*" : ".*?");
  625. pto += strlen(pto);
  626. break;
  627. case '+':
  628. pti += 3 + get_regex_range(&pti[3], pto, NULL, NULL);
  629. pto += strlen(pto);
  630. break;
  631. case '.':
  632. pti += 3;
  633. strcpy(pto, ".");
  634. pto += strlen(pto);
  635. break;
  636. case '{':
  637. pti = get_arg_in_braces(ses, pti+2, pto, GET_ALL);
  638. pto += strlen(pto);
  639. break;
  640. default:
  641. *pto++ = *pti++;
  642. break;
  643. }
  644. break;
  645. default:
  646. *pto++ = *pti++;
  647. break;
  648. }
  649. break;
  650. default:
  651. *pto++ = *pti++;
  652. break;
  653. }
  654. }
  655. *pto = 0;
  656. return regexp_compare(ses, nodepcre, str, out, option, flag + fix);
  657. }
  658. pcre *tintin_regexp_compile(struct session *ses, struct listnode *node, char *exp, int option)
  659. {
  660. char out[BUFFER_SIZE], *pti, *pto;
  661. pti = exp;
  662. pto = out;
  663. if (*pti == '~')
  664. {
  665. pti++;
  666. }
  667. while (*pti == '^')
  668. {
  669. *pto++ = *pti++;
  670. }
  671. while (*pti)
  672. {
  673. if (HAS_BIT(ses->charset, CHARSET_FLAG_EUC) && is_euc_head(ses, pti))
  674. {
  675. *pto++ = *pti++;
  676. switch (*pti)
  677. {
  678. case '\\':
  679. case '[':
  680. case ']':
  681. case '(':
  682. case ')':
  683. case '|':
  684. case '.':
  685. case '?':
  686. case '+':
  687. case '*':
  688. case '$':
  689. case '^':
  690. *pto++ = '\\';
  691. break;
  692. }
  693. *pto++ = *pti++;
  694. continue;
  695. }
  696. switch (pti[0])
  697. {
  698. case '\\':
  699. *pto++ = *pti++;
  700. *pto++ = *pti++;
  701. break;
  702. case '{':
  703. *pto++ = '(';
  704. pti = get_arg_in_braces(ses, pti, pto, GET_ALL);
  705. while (*pto)
  706. {
  707. if (pto[0] == '$' || pto[0] == '@')
  708. {
  709. if (pto[1])
  710. {
  711. return NULL;
  712. }
  713. }
  714. pto++;
  715. }
  716. *pto++ = ')';
  717. break;
  718. case '&':
  719. if (pti[1] == DEFAULT_OPEN || isalnum((int) pti[1]) || pti[1] == '&')
  720. {
  721. return NULL;
  722. }
  723. *pto++ = *pti++;
  724. break;
  725. case '@':
  726. if (pti[1] == DEFAULT_OPEN || isalnum((int) pti[1]) || pti[1] == '@')
  727. {
  728. return NULL;
  729. }
  730. *pto++ = *pti++;
  731. break;
  732. case '$':
  733. if (pti[1] == DEFAULT_OPEN || isalnum((int) pti[1]))
  734. {
  735. return NULL;
  736. }
  737. {
  738. int i = 0;
  739. while (pti[++i] == '$')
  740. {
  741. continue;
  742. }
  743. if (pti[i])
  744. {
  745. *pto++ = '\\';
  746. }
  747. }
  748. *pto++ = *pti++;
  749. break;
  750. case '[':
  751. case ']':
  752. case '(':
  753. case ')':
  754. case '|':
  755. case '.':
  756. case '?':
  757. case '+':
  758. case '*':
  759. case '^':
  760. *pto++ = '\\';
  761. *pto++ = *pti++;
  762. break;
  763. case '%':
  764. switch (pti[1])
  765. {
  766. case '0':
  767. case '1':
  768. case '2':
  769. case '3':
  770. case '4':
  771. case '5':
  772. case '6':
  773. case '7':
  774. case '8':
  775. case '9':
  776. pti += isdigit((int) pti[2]) ? 3 : 2;
  777. strcpy(pto, *pti == 0 ? "(.*)" : "(.*?)");
  778. pto += strlen(pto);
  779. break;
  780. case 'd':
  781. pti += 2;
  782. strcpy(pto, *pti == 0 ? "([0-9]*)" : "([0-9]*?)");
  783. pto += strlen(pto);
  784. break;
  785. case 'D':
  786. pti += 2;
  787. strcpy(pto, *pti == 0 ? "([^0-9]*)" : "([^0-9]*?)");
  788. pto += strlen(pto);
  789. break;
  790. case 'i':
  791. pti += 2;
  792. strcpy(pto, "(?i)");
  793. pto += strlen(pto);
  794. break;
  795. case 'I':
  796. pti += 2;
  797. strcpy(pto, "(?-i)");
  798. pto += strlen(pto);
  799. break;
  800. case 'p':
  801. pti += 2;
  802. pto += sprintf(pto, "%s", *pti == 0 ? "([\\x20-\\xfe]*)" : "([\\x20-\\xfe]*?)");
  803. break;
  804. case 'P':
  805. pti += 2;
  806. pto += sprintf(pto, "%s", *pti == 0 ? "([^\\x20-\\xfe]*)" : "([^\\x20-\\xfe]*?)");
  807. break;
  808. case 's':
  809. pti += 2;
  810. strcpy(pto, *pti == 0 ? "(\\s*)" : "(\\s*?)");
  811. pto += strlen(pto);
  812. break;
  813. case 'S':
  814. pti += 2;
  815. strcpy(pto, *pti == 0 ? "(\\S*)" : "(\\S*?)");
  816. pto += strlen(pto);
  817. break;
  818. case 'u':
  819. pti += 2;
  820. strcpy(pto, *pti == 0 ? "((?:[\\xC0-\\xFE][\\x80-\\xC0]{1,3})*)" : "((?:[\\xC0-\\xFE][\\x80-\\xC0]{1,3})*?)");
  821. pto += strlen(pto);
  822. break;
  823. case 'U':
  824. pti += 2;
  825. strcpy(pto, *pti == 0 ? "([\\x00-\\x7F\\xFF]*)" : "([\\x00-\\x7F\\xFF]*?)");
  826. pto += strlen(pto);
  827. break;
  828. case 'w':
  829. pti += 2;
  830. strcpy(pto, *pti == 0 ? "([a-zA-Z]*)" : "([a-zA-Z]*?)");
  831. pto += strlen(pto);
  832. break;
  833. case 'W':
  834. pti += 2;
  835. strcpy(pto, *pti == 0 ? "([^a-zA-Z]*)" : "([^a-zA-Z]*?)");
  836. pto += strlen(pto);
  837. break;
  838. case '?':
  839. pti += 2;
  840. strcpy(pto, *pti == 0 ? "(.?)" : "(.?" "?)");
  841. pto += strlen(pto);
  842. break;
  843. case '*':
  844. pti += 2;
  845. strcpy(pto, *pti == 0 ? "(.*)" : "(.*?)");
  846. pto += strlen(pto);
  847. break;
  848. case '+':
  849. pti += 2 + get_regex_range(&pti[2], pto, NULL, NULL);
  850. pto += strlen(pto);
  851. break;
  852. case '.':
  853. pti += 2;
  854. strcpy(pto, "(.)");
  855. pto += strlen(pto);
  856. break;
  857. case '%':
  858. *pto++ = *pti++;
  859. pti++;
  860. break;
  861. case '!':
  862. switch (pti[2])
  863. {
  864. case 'd':
  865. pti += 3;
  866. strcpy(pto, *pti == 0 ? "[0-9]*" : "[0-9]*?");
  867. pto += strlen(pto);
  868. break;
  869. case 'D':
  870. pti += 3;
  871. strcpy(pto, *pti == 0 ? "[^0-9]*" : "[^0-9]*?");
  872. pto += strlen(pto);
  873. break;
  874. case 'p':
  875. pti += 3;
  876. pto += sprintf(pto, "%s", *pti == 0 ? "[\\x21-\\x7E]*" : "[\\x21-\\x7E]?*");
  877. break;
  878. case 'P':
  879. pti += 3;
  880. pto += sprintf(pto, "%s", *pti == 0 ? "[^\\x20-\\xfe]*" : "[^\\x20-\\xfe]*?");
  881. break;
  882. case 's':
  883. pti += 3;
  884. strcpy(pto, *pti == 0 ? "\\s*" : "\\s*?");
  885. pto += strlen(pto);
  886. break;
  887. case 'S':
  888. pti += 3;
  889. strcpy(pto, *pti == 0 ? "\\S*" : "\\S*?");
  890. pto += strlen(pto);
  891. break;
  892. case 'w':
  893. pti += 3;
  894. strcpy(pto, *pti == 0 ? "[a-zA-Z]*" : "[a-zA-Z]*?");
  895. pto += strlen(pto);
  896. break;
  897. case 'W':
  898. pti += 3;
  899. strcpy(pto, *pti == 0 ? "[^a-zA-Z]*" : "[^a-zA-Z]*?");
  900. pto += strlen(pto);
  901. break;
  902. case '?':
  903. pti += 3;
  904. strcpy(pto, *pti == 0 ? ".?" : ".?" "?");
  905. pto += strlen(pto);
  906. break;
  907. case '*':
  908. pti += 3;
  909. strcpy(pto, *pti == 0 ? ".*" : ".*?");
  910. pto += strlen(pto);
  911. break;
  912. case '+':
  913. pti += 3 + get_regex_range(&pti[3], pto, NULL, NULL);
  914. pto += strlen(pto);
  915. break;
  916. case '.':
  917. pti += 3;
  918. strcpy(pto, ".");
  919. pto += strlen(pto);
  920. break;
  921. case '{':
  922. pti = get_arg_in_braces(ses, pti+2, pto, GET_ALL);
  923. while (*pto)
  924. {
  925. if (pto[0] == '$' || pto[0] == '@')
  926. {
  927. if (pto[1])
  928. {
  929. return NULL;
  930. }
  931. }
  932. pto++;
  933. }
  934. break;
  935. default:
  936. *pto++ = *pti++;
  937. break;
  938. }
  939. break;
  940. default:
  941. *pto++ = *pti++;
  942. break;
  943. }
  944. break;
  945. default:
  946. *pto++ = *pti++;
  947. break;
  948. }
  949. }
  950. *pto = 0;
  951. return regexp_compile(ses, out, option);
  952. }
  953. void tintin_macro_compile(char *input, char *output)
  954. {
  955. char *pti, *pto;
  956. pti = input;
  957. pto = output;
  958. if (*pti == '^')
  959. {
  960. pti++;
  961. }
  962. while (*pti)
  963. {
  964. switch (pti[0])
  965. {
  966. case '\\':
  967. switch (pti[1])
  968. {
  969. case 'C':
  970. if (pti[2] == '-' && pti[3])
  971. {
  972. *pto++ = pti[3] - 'a' + 1;
  973. pti += 4;
  974. }
  975. else
  976. {
  977. *pto++ = *pti++;
  978. }
  979. break;
  980. case 'c':
  981. *pto++ = pti[2] % 32;
  982. pti += 3;
  983. break;
  984. case 'a':
  985. *pto++ = ASCII_BEL;
  986. pti += 2;
  987. break;
  988. case 'b':
  989. *pto++ = 127;
  990. pti += 2;
  991. break;
  992. case 'e':
  993. *pto++ = ASCII_ESC;
  994. pti += 2;
  995. break;
  996. case 'r':
  997. *pto++ = ASCII_CR;
  998. pti += 2;
  999. break;
  1000. case 't':
  1001. *pto++ = ASCII_HTAB;
  1002. pti += 2;
  1003. break;
  1004. case 'x':
  1005. if (pti[2] && pti[3])
  1006. {
  1007. *pto++ = hex_number_8bit(&pti[2]);
  1008. pti += 4;
  1009. }
  1010. else
  1011. {
  1012. *pto++ = *pti++;
  1013. }
  1014. break;
  1015. default:
  1016. *pto++ = *pti++;
  1017. break;
  1018. }
  1019. break;
  1020. default:
  1021. *pto++ = *pti++;
  1022. break;
  1023. }
  1024. }
  1025. *pto = 0;
  1026. }