Loading...
Searching...
No Matches
dc_regex.f90
Go to the documentation of this file.
1!-----------------------------------------------------------------------
2! Copyright (c) 2000-2017 Gtool Development Group. All rights reserved.
3!-----------------------------------------------------------------------
4!>
5!> @author Youhei SASAKI
6!> @copyright Copyright (C) GFD Dennou Club, 2000-2016. All rights reserved. <br/>
7!> License is BSD-2-Clause. see [COPYRIGHT](@ref COPYRIGHT) in detail
8!> @en
9!> @brief Provide simple regular expression subroutine: 'match'
10!> @enden
11!>
12!> @ja
13!> @brief シンプルな正規表現関数 'match' を提供します.
14!> @endja
15!>
17 implicit none
18 private
19 public:: match
20
21 character, save :: C_ESCAPE = '#'
22 integer, parameter :: SYM_EOL = -128
23 integer, parameter :: SYM_ANYCHAR = 500
24 integer, parameter :: SYM_QUESTION = 501
25 integer, parameter :: SYM_PLUS = 502
26 integer, parameter :: SYM_STAR = 503
27 integer, parameter :: SYM_NORMAL_SET = 520
28 integer, parameter :: SYM_REVERSED_SET = 521
29 integer, parameter :: SYM_HEADFIX = 540
30 integer, parameter :: SYM_TAILFIX = 541
31 integer, parameter :: SYM_ISDIGIT = 560
32 integer, parameter :: SYM_ISALPHA = 561
33 integer, parameter :: SYM_ISWORD = 562
34 integer, parameter :: SYM_ISSPACE = 563
35 integer, parameter :: SYM_ISXDIGIT = 564
36 integer, parameter :: SYM_COUNT_BASE = 1000
37
38contains
39
40 subroutine preprocess_pattern(pattern, symbols)
41 !
42 ! メタキャラクタと普通の文字を分離
43 !
44 character(len = *), intent(in):: pattern
45 integer, intent(out):: symbols(:)
46 integer:: i, j, code, imax, j_last_set
47 integer:: status, stat_return
48 integer, parameter:: STAT_INIT = 1, stat_escape = 2, &
49 stat_open_set = 3, stat_in_set = 4, stat_hexadecimal = 5
50 character:: c
51 continue
52 status = stat_init
53 stat_return = stat_init
54 symbols(:) = sym_eol
55 j_last_set = 0
56 imax = len_trim(pattern)
57 j = 1
58 do, i = 1, imax
59 c = pattern(i:i)
60 select case(status)
61 case(stat_init)
62 if (c == c_escape) then
63 status = stat_escape
64 cycle
65 else if (c == "[") then
66 symbols(j) = sym_normal_set
67 status = stat_open_set
68 else if (c == ".") then
69 symbols(j) = sym_anychar
70 else if (c == "?") then
71 symbols(j) = sym_question
72 else if (c == "+") then
73 symbols(j) = sym_plus
74 else if (c == "*") then
75 symbols(j) = sym_star
76 else if (c == "^" .and. i == 1) then
77 symbols(j) = sym_headfix
78 else if (c == "$" .and. i == imax) then
79 symbols(j) = sym_tailfix
80 else
81 symbols(j) = ichar(c)
82 endif
83 case(stat_escape)
84 if (c == 'd' .or. c == 'D') then
85 symbols(j) = sym_isdigit
86 else if (c == 'a' .or. c == 'A') then
87 symbols(j) = sym_isalpha
88 else if (c == 'w' .or. c == 'W') then
89 symbols(j) = sym_isword
90 else if (c == 's' .or. c == 'S') then
91 symbols(j) = sym_isspace
92 else if (c == 'z' .or. c == 'Z') then
93 symbols(j) = sym_isxdigit
94 else if (c == 'x' .or. c == 'X') then
95 symbols(j) = -1
96 status = stat_hexadecimal
97 cycle
98 else
99 symbols(j) = ichar(c)
100 end if
101 status = stat_return
102 case(stat_hexadecimal)
103 code = index("123456789ABCDEFabcdef", c)
104 if (code >= 16) code = code - 6
105 if (symbols(j) == -1) then
106 symbols(j) = code
107 cycle
108 else
109 symbols(j) = symbols(j) * 16 + code
110 status = stat_return
111 endif
112 case(stat_open_set)
113 symbols(j) = sym_count_base
114 j_last_set = j
115 stat_return = stat_in_set
116 if (c == '^') then
117 symbols(j - 1) = sym_reversed_set
118 status = stat_in_set
119 else if (c == c_escape) then
120 status = stat_escape
121 else
122 j = j + 1
123 symbols(j) = ichar(c)
124 status = stat_in_set
125 endif
126 case(stat_in_set)
127 if (c == ']') then
128 symbols(j_last_set) = sym_count_base + j - j_last_set - 1
129 stat_return = stat_init
130 status = stat_init
131 cycle
132 else if (c == c_escape) then
133 status = stat_escape
134 cycle
135 else
136 symbols(j) = ichar(c)
137 endif
138 end select
139 j = j + 1
140 enddo
141 select case(status)
142 case(stat_escape)
143 symbols(j) = ichar(' ')
144 case(stat_open_set)
145 symbols(j) = sym_count_base
146 case(stat_in_set)
147 symbols(j_last_set) = sym_count_base + j - j_last_set - 1
148 end select
149 end subroutine preprocess_pattern
150
151 ! マッチすれば length は非負になる。
152 ! マッチしなければ length == -1 となる。
153 recursive subroutine match_here(ipat, text, length)
154 integer, intent(in):: ipat(:)
155 character(len = *), intent(in):: text
156 integer, intent(out):: length
157 integer:: s1, s2, remain, i, hitmax, hitcount, hit_at_least
158 logical:: normal_hit
159 continue
160 ! パターンの終わり。空パターンには何でもマッチ
161 if (size(ipat) == 0 .or. ipat(1) == sym_eol) then
162 length = 0
163 return
164 endif
165 ! パターンの文末固定指示
166 if (ipat(1) == sym_tailfix) then
167 if (text == "") then
168 length = 0
169 else
170 length = -1
171 endif
172 return
173 endif
174 if (len(text) == 0) then
175 length = -1
176 return
177 endif
178 ! 1字指定(範囲または1字リテラル)の抽出 ... ipat(s1:s2)
179 if (ipat(1) == sym_normal_set) then
180 s1 = 3
181 s2 = 2 + ipat(2) - sym_count_base
182 normal_hit = .true.
183 else if (ipat(1) == sym_reversed_set) then
184 s1 = 3
185 s2 = 2 + ipat(2) - sym_count_base
186 normal_hit = .false.
187 else
188 s1 = 1
189 s2 = 1
190 normal_hit = .true.
191 endif
192 ! その次の記号 ipat(s2+1) は量化子か次の1字指定である
193 remain = s2 + 2
194 select case (ipat(s2 + 1))
195 case(sym_star)
196 hitmax = len(text)
197 hit_at_least = 0
198 case(sym_plus)
199 hitmax = len(text)
200 hit_at_least = 1
201 case(sym_question)
202 hitmax = 1
203 hit_at_least = 0
204 case default
205 hitmax = 1
206 hit_at_least = 1
207 remain = s2 + 1
208 end select
209 ! 現位置以降の1字指定のヒット数を数える
210 hitcount = 0
211 do, i = 1, hitmax
212 if (hit(ipat(s1:s2), text(i:i)) .neqv. normal_hit) then
213 exit
214 endif
215 hitcount = i
216 enddo
217 ! 現位置で無ヒットの場合、ヒットを要するならマッチ失敗
218 if (hitcount < hit_at_least) then
219 length = -1
220 return
221 endif
222 ! 最長原理: なるべく長くヒットしたものから、残りのマッチする
223 ! ものを探す。いわゆる最左最長探索の最長である。
224 do, i = 1 + hitcount, 1 + hit_at_least, -1
225 call match_here(ipat(remain: ), text(i: ), length)
226 if (length >= 0) then
227 length = length + i - 1
228 return
229 endif
230 enddo
231 length = -1
232 end subroutine match_here
233
234 logical function hit(ipat, c) result(result)
235 integer, intent(in):: ipat(:)
236 character(len=*), intent(in):: c
237 character(len=*), parameter:: &
238 & DIGIT = "0123456789", &
239 & XDIGIT = "ABCDEFabcdef", &
240 & ALPHA = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
241 integer:: i
242 continue
243 do, i = 1, size(ipat)
244 select case(ipat(i))
245 case(sym_anychar)
246 result = .true.
247 case(sym_isalpha)
248 result = (index(alpha, c) > 0)
249 case(sym_isdigit)
250 result = (index(digit, c) > 0)
251 case(sym_isword)
252 result = (index(digit, c) > 0 .or. index(alpha, c) > 0 .or. &
253 & c == '_')
254 case(sym_isxdigit)
255 result = (index(digit, c) > 0 .or. index(xdigit, c) > 0)
256 case(sym_isspace)
257 result = (c == ' ' .or. (iachar(c) >= 8 .and. iachar(c) <= 13))
258 case default
259 result = (ipat(i) == ichar(c))
260 end select
261 if (result) return
262 enddo
263 result = .false.
264 end function hit
265
266 subroutine match(pattern, text, start, length)
267 !
268 !> _pattern_ には正規表現を与えます。
269 !> _text_ には正規表現によって探査したい文字列を与えます。
270 !>
271 !> _pattern_ が _text_ にマッチした場合、
272 !> _start_ には文字列の何文字目からマッチしたのかを示す数値 (正の整数)
273 !> が返ります。
274 !> _length_ には何文字分マッチしたのかを示す数値 (正の整数)
275 !> が返ります。
276 !>
277 !> マッチしない場合、 length == -1, start == 0 となります。
278 !>
279 !
280 !> 例
281 !>~~~~~~~~~~~~~~~{.f90}
282 !> program regex_test
283 !> use dc_regex, only: match
284 !> use dc_types, only: TOKEN
285 !> implicit none
286 !>
287 !> integer:: start, length
288 !> character(TOKEN) :: pattern, text
289 !> continue
290 !> pattern = "->"
291 !> text = "time->0.0,x->hoge"
292 !> call match(trim(pattern), trim(text), start, length)
293 !> call formatted_print
294 !>
295 !> pattern = "^##+"
296 !> text = "####### hoge"
297 !> call match(trim(pattern), trim(text), start, length)
298 !> call formatted_print
299 !>
300 !> pattern = "@+$"
301 !> text = "# hoge @@@"
302 !> call match(trim(pattern), trim(text), start, length)
303 !> call formatted_print
304 !>
305 !> contains
306 !> subroutine formatted_print
307 !> use dc_string, only: Printf
308 !> call Printf(fmt='pattern= %c : text= %c : start= %d : length= %d', &
309 !> & c1=trim(pattern), c2=trim(text), i=(/start, length/))
310 !> end subroutine formatted_print
311 !>
312 !> end program regex_test
313 !>~~~~~~~~~~~~~~~
314 !> このプログラムを実行することで以下の出力が得られるはずです。
315 !>
316 !> pattern= -> : text= time->0.0,x->hoge : start= 5 : length= 2
317 !> pattern= ^##+ : text= ####### hoge : start= 1 : length= 7
318 !> pattern= @+$ : text= # hoge @@@ : start= 8 : length= 3
319 !>
320 implicit none
321 character(len = *), intent(in):: pattern, text
322 integer, intent(out):: start, length
323 integer, allocatable:: ipattern(:)
324 integer:: text_length
325 continue
326 ! 空 pattern は空文字列に適合
327 if (len(pattern) <= 0) then
328 length = 0
329 start = 1
330 return
331 endif
332 ! メタキャラクタの認識
333 allocate(ipattern(len(pattern) + 2))
334 call preprocess_pattern(pattern, ipattern)
335 ! 頭寄せ指定のある場合
336 if (ipattern(1) == sym_headfix) then
337 start = 1
338 call match_here(ipattern(2: ), text, length)
339 if (length < 0) goto 995
340 goto 999
341 endif
342 ! 最左原理
343 text_length = len(text)
344 do, start = 1, text_length + 1
345 call match_here(ipattern, text(start:text_length), length)
346 if (length >= 0) goto 999
347 end do
348 ! みつからない場合
349995 continue
350 start = 0
351 length = -1
352999 continue
353 deallocate(ipattern)
354 end subroutine match
355
356end module dc_regex
シンプルな正規表現関数 'match' を提供します.
Definition dc_regex.f90:16
subroutine, public match(pattern, text, start, length)
Definition dc_regex.f90:267