DISPATCH
tokenizer.py
1 """Fortran namelist tokenizer.
2 
3 :copyright: Copyright 2017 Marshall Ward, see AUTHORS for details.
4 :license: Apache License, Version 2.0, see LICENSE for details.
5 """
6 import string
7 import itertools
8 
9 
class Tokenizer(object):
    """Fortran namelist tokenizer.

    Splits one line of Fortran source at a time into a list of string
    tokens (names, numbers, strings, punctuation, whitespace runs, and
    comments), preserving every input character across the token list.
    """

    # I don't use these two
    special_chars = ' =+-*/\\()[]{},.:;!"%&~<>?\'`|$#@'     # Table 3.1
    lexical_tokens = '=+-*/()[],.:;%&<>'    # Meaningful?

    # I only use this one
    punctuation = '=+-*/\\()[]{},:;%&~<>?`|$#@'     # Unhandled Table 3.1 tokens

    def __init__(self):
        """Initialise the tokenizer."""
        self.characters = None      # Iterator over the current line
        self.prior_char = None      # Character read before ``self.char``
        self.char = None            # Character currently under the cursor
        self.idx = None             # Index of ``self.char`` within the line
        # '\n' is excluded: it is the end-of-line sentinel, not whitespace
        self.whitespace = string.whitespace.replace('\n', '')
        # Delimiter of a string token continued from the prior line, if any
        self.prior_delim = None

        # Set to true if inside a namelist group
        self.group_token = None

    def parse(self, line):
        """Tokenize a line of Fortran source.

        Returns the list of tokens.  Outside a namelist group
        (``group_token`` is None) the remainder of the line is treated
        as a single comment token.
        """
        tokens = []

        self.idx = -1   # Bogus value to ensure idx = 0 after first iteration
        self.characters = iter(line)
        self.update_chars()

        while self.char != '\n':

            # Update namelist group status
            if self.char in ('&', '$'):
                self.group_token = self.char

            if self.group_token and (
                    (self.group_token, self.char) in (('&', '/'), ('$', '$'))):
                self.group_token = False

            word = ''
            if self.char in self.whitespace:
                # Consume a contiguous run of whitespace as one token
                while self.char in self.whitespace:
                    word += self.char
                    self.update_chars()

            elif self.char in ('!', '#') or self.group_token is None:
                # Abort the iteration and build the comment token.
                # Strip only an actual trailing newline: a line without one
                # (e.g. the final line of a file) keeps its last character.
                word = line[self.idx:]
                if word.endswith('\n'):
                    word = word[:-1]
                self.char = '\n'

            elif self.char in '"\'' or self.prior_delim:
                word = self.parse_string()

            elif self.char.isalpha():
                word = self.parse_name(line)

            elif self.char.isdigit() or self.char == '-':
                word = self.parse_numeric()

            elif self.char == '.':
                self.update_chars()
                if self.char.isdigit():
                    # A fraction such as ``.5``
                    frac = self.parse_numeric()
                    word = '.' + frac
                else:
                    # A logical constant or operator such as ``.true.``
                    word = '.'
                    while self.char.isalpha():
                        word += self.char
                        self.update_chars()
                    if self.char == '.':
                        word += self.char
                        self.update_chars()

            elif self.char in Tokenizer.punctuation:
                word = self.char
                self.update_chars()

            else:
                # This should never happen
                raise ValueError

            tokens.append(word)

        return tokens

    def parse_name(self, line):
        """Tokenize a Fortran name, such as a variable or subroutine."""
        end = self.idx
        for char in line[self.idx:]:
            # Names may contain quotes and underscores (kind suffixes etc.)
            if not char.isalnum() and char not in '\'"_':
                break
            end += 1

        word = line[self.idx:end]

        self.idx = end - 1
        # Update iterator, minus first character which was already read
        self.characters = itertools.islice(self.characters, len(word) - 1,
                                           None)
        self.update_chars()

        return word

    def parse_string(self):
        """Tokenize a Fortran string, handling escaped (doubled) delimiters
        and strings continued across line boundaries via ``prior_delim``."""
        word = ''

        if self.prior_delim:
            # Resume a string started on an earlier line
            delim = self.prior_delim
            self.prior_delim = None
        else:
            delim = self.char
            word += self.char
            self.update_chars()

        while True:
            if self.char == delim:
                # Check for escaped delimiters
                self.update_chars()
                if self.char == delim:
                    word += 2 * delim
                    self.update_chars()
                else:
                    word += delim
                    break
            elif self.char == '\n':
                # String continues on the next line; remember its delimiter
                self.prior_delim = delim
                break
            else:
                word += self.char
                self.update_chars()

        return word

    def parse_numeric(self):
        """Tokenize a Fortran numerical value, including sign, a single
        decimal point, and an optional exponent (``e``/``E``/``d``/``D``)."""
        word = ''
        frac = False

        if self.char == '-':
            word += self.char
            self.update_chars()

        while self.char.isdigit() or (self.char == '.' and not frac):
            # Only allow one decimal point
            if self.char == '.':
                frac = True
            word += self.char
            self.update_chars()

        # Check for float exponent
        if self.char in 'eEdD':
            word += self.char
            self.update_chars()

            if self.char in '+-':
                word += self.char
                self.update_chars()
            while self.char.isdigit():
                word += self.char
                self.update_chars()

        return word

    def update_chars(self):
        """Update the current characters in the tokenizer."""
        # NOTE: We spoof non-Unix files by returning '\n' on StopIteration
        self.prior_char, self.char = self.char, next(self.characters, '\n')
        self.idx += 1
def parse(self, line)
Definition: tokenizer.py:32
def parse_name(self, line)
Definition: tokenizer.py:96