Lexical analyzer for Ruby source
# File rdoc/parsers/parse_rb.rb, line 443
def RubyLex.debug?
false
end
# File rdoc/parsers/parse_rb.rb, line 447
def initialize(content)
lex_init
@reader = BufferedReader.new(content)
@exp_line_no = @line_no = 1
@base_char_no = 0
@indent = 0
@ltype = nil
@quoted = nil
@lex_state = EXPR_BEG
@space_seen = false
@continue = false
@line = ""
@skip_space = false
@read_auto_clean_up = false
@exception_on_syntax_error = true
end
# File rdoc/parsers/parse_rb.rb, line 480
def char_no
@reader.column
end
# File rdoc/parsers/parse_rb.rb, line 484
def get_read
@reader.get_read
end
# File rdoc/parsers/parse_rb.rb, line 492
def getc_of_rests
@reader.getc_already_read
end
# File rdoc/parsers/parse_rb.rb, line 496
def gets
c = getc or return
l = ""
begin
l.concat c unless c == "\r"
break if c == "\n"
end while c = getc
l
end
# File rdoc/parsers/parse_rb.rb, line 1269
def identify_comment
@ltype = "#"
comment = "#"
while ch = getc
if ch == "\\"
ch = getc
if ch == "\n"
ch = " "
else
comment << "\\"
end
else
if ch == "\n"
@ltype = nil
ungetc
break
end
end
comment << ch
end
return Token(TkCOMMENT).set_text(comment)
end
# File rdoc/parsers/parse_rb.rb, line 964
def identify_gvar
@lex_state = EXPR_END
str = "$"
tk = case ch = getc
when /[~_*$?!@\/\\;,=:<>".]/ #"
str << ch
Token(TkGVAR, str)
when "-"
str << "-" << getc
Token(TkGVAR, str)
when "&", "`", "'", "+"
str << ch
Token(TkBACK_REF, str)
when /[1-9]/
str << ch
while (ch = getc) =~ /[0-9]/
str << ch
end
ungetc
Token(TkNTH_REF)
when /\w/
ungetc
ungetc
return identify_identifier
else
ungetc
Token("$")
end
tk.set_text(str)
end
# File rdoc/parsers/parse_rb.rb, line 1074
def identify_here_document
ch = getc
if ch == "-"
ch = getc
indent = true
end
if /['"`]/ =~ ch # '
lt = ch
quoted = ""
while (c = getc) && c != lt
quoted.concat c
end
else
lt = '"'
quoted = ch.dup
while (c = getc) && c =~ /\w/
quoted.concat c
end
ungetc
end
ltback, @ltype = @ltype, lt
reserve = ""
while ch = getc
reserve << ch
if ch == "\\" #"
ch = getc
reserve << ch
elsif ch == "\n"
break
end
end
str = ""
while (l = gets)
l.chomp!
l.strip! if indent
break if l == quoted
str << l.chomp << "\n"
end
@reader.divert_read_from(reserve)
@ltype = ltback
@lex_state = EXPR_END
Token(Ltype2Token[lt], str).set_text(str.dump)
end
# File rdoc/parsers/parse_rb.rb, line 999
def identify_identifier
token = ""
token.concat getc if peek(0) =~ /[$@]/
token.concat getc if peek(0) == "@"
while (ch = getc) =~ /\w|_/
print ":", ch, ":" if RubyLex.debug?
token.concat ch
end
ungetc
if ch == "!" or ch == "?"
token.concat getc
end
# fix token
# $stderr.puts "identifier - #{token}, state = #@lex_state"
case token
when /^\$/
return Token(TkGVAR, token).set_text(token)
when /^\@/
@lex_state = EXPR_END
return Token(TkIVAR, token).set_text(token)
end
if @lex_state != EXPR_DOT
print token, "\n" if RubyLex.debug?
token_c, *trans = TkReading2Token[token]
if token_c
# reserved word?
if (@lex_state != EXPR_BEG &&
@lex_state != EXPR_FNAME &&
trans[1])
# modifiers
token_c = TkSymbol2Token[trans[1]]
@lex_state = trans[0]
else
if @lex_state != EXPR_FNAME
if ENINDENT_CLAUSE.include?(token)
@indent += 1
elsif DEINDENT_CLAUSE.include?(token)
@indent -= 1
end
@lex_state = trans[0]
else
@lex_state = EXPR_END
end
end
return Token(token_c, token).set_text(token)
end
end
if @lex_state == EXPR_FNAME
@lex_state = EXPR_END
if peek(0) == '='
token.concat getc
end
elsif @lex_state == EXPR_BEG || @lex_state == EXPR_DOT
@lex_state = EXPR_ARG
else
@lex_state = EXPR_END
end
if token[0, 1] =~ /[A-Z]/
return Token(TkCONSTANT, token).set_text(token)
elsif token[token.size - 1, 1] =~ /[!?]/
return Token(TkFID, token).set_text(token)
else
return Token(TkIDENTIFIER, token).set_text(token)
end
end
# File rdoc/parsers/parse_rb.rb, line 1142
def identify_number(start)
str = start.dup
if start == "+" or start == "-" or start == ""
start = getc
str << start
end
@lex_state = EXPR_END
if start == "0"
if peek(0) == "x"
ch = getc
str << ch
match = /[0-9a-f_]/
else
match = /[0-7_]/
end
while ch = getc
if ch !~ match
ungetc
break
else
str << ch
end
end
return Token(TkINTEGER).set_text(str)
end
type = TkINTEGER
allow_point = TRUE
allow_e = TRUE
while ch = getc
case ch
when /[0-9_]/
str << ch
when allow_point && "."
type = TkFLOAT
if peek(0) !~ /[0-9]/
ungetc
break
end
str << ch
allow_point = false
when allow_e && "e", allow_e && "E"
str << ch
type = TkFLOAT
if peek(0) =~ /[+-]/
str << getc
end
allow_e = false
allow_point = false
else
ungetc
break
end
end
Token(type).set_text(str)
end
# File rdoc/parsers/parse_rb.rb, line 1123
def identify_quotation(initial_char)
ch = getc
if lt = PERCENT_LTYPE[ch]
initial_char += ch
ch = getc
elsif ch =~ /\W/
lt = "\""
else
RubyLex.fail SyntaxError, "unknown type of %string ('#{ch}')"
end
# if ch !~ /\W/
# ungetc
# next
# end
#@ltype = lt
@quoted = ch unless @quoted = PERCENT_PAREN[ch]
identify_string(lt, @quoted, ch, initial_char)
end
# File rdoc/parsers/parse_rb.rb, line 1204
def identify_string(ltype, quoted = ltype, opener=nil, initial_char = nil)
@ltype = ltype
@quoted = quoted
subtype = nil
str = ""
str << initial_char if initial_char
str << (opener||quoted)
nest = 0
begin
while ch = getc
str << ch
if @quoted == ch
if nest == 0
break
else
nest -= 1
end
elsif opener == ch
nest += 1
elsif @ltype != "'" && @ltype != "]" and ch == "#"
ch = getc
if ch == "{"
subtype = true
str << ch << skip_inner_expression
else
ungetc(ch)
end
elsif ch == '\\' #'
str << read_escape
end
end
if @ltype == "/"
if peek(0) =~ /i|o|n|e|s/
str << getc
end
end
if subtype
Token(DLtype2Token[ltype], str)
else
Token(Ltype2Token[ltype], str)
end.set_text(str)
ensure
@ltype = nil
@quoted = nil
@lex_state = EXPR_END
end
end
# File rdoc/parsers/parse_rb.rb, line 519
def lex
until (((tk = token).kind_of?(TkNL) || tk.kind_of?(TkEND_OF_SCRIPT)) &&
!@continue or
tk.nil?)
end
line = get_read
if line == "" and tk.kind_of?(TkEND_OF_SCRIPT) || tk.nil?
nil
else
line
end
end
# File rdoc/parsers/parse_rb.rb, line 589
def lex_init()
@OP = SLex.new
@OP.def_rules("\0", "\004", "\032") do |chars, io|
Token(TkEND_OF_SCRIPT).set_text(chars)
end
@OP.def_rules(" ", "\t", "\f", "\r", "\13") do |chars, io|
@space_seen = TRUE
while (ch = getc) =~ /[ \t\f\r\13]/
chars << ch
end
ungetc
Token(TkSPACE).set_text(chars)
end
@OP.def_rule("#") do
|op, io|
identify_comment
end
@OP.def_rule("=begin", proc{@prev_char_no == 0 && peek(0) =~ /\s/}) do
|op, io|
str = op
@ltype = "="
begin
line = ""
begin
ch = getc
line << ch
end until ch == "\n"
str << line
end until line =~ /^=end/
ungetc
@ltype = nil
if str =~ /\A=begin\s+rdoc/i
str.sub!(/\A=begin.*\n/, '')
str.sub!(/^=end.*/m, '')
Token(TkCOMMENT).set_text(str)
else
Token(TkRD_COMMENT)#.set_text(str)
end
end
@OP.def_rule("\n") do
print "\\n\n" if RubyLex.debug?
case @lex_state
when EXPR_BEG, EXPR_FNAME, EXPR_DOT
@continue = TRUE
else
@continue = FALSE
@lex_state = EXPR_BEG
end
Token(TkNL).set_text("\n")
end
@OP.def_rules("*", "**",
"!", "!=", "!~",
"=", "==", "===",
"=~", "<=>",
"<", "<=",
">", ">=", ">>") do
|op, io|
@lex_state = EXPR_BEG
Token(op).set_text(op)
end
@OP.def_rules("<<") do
|op, io|
tk = nil
if @lex_state != EXPR_END && @lex_state != EXPR_CLASS &&
(@lex_state != EXPR_ARG || @space_seen)
c = peek(0)
if /[-\w_\"\'\`]/ =~ c
tk = identify_here_document
end
end
if !tk
@lex_state = EXPR_BEG
tk = Token(op).set_text(op)
end
tk
end
@OP.def_rules("'", '"') do
|op, io|
identify_string(op)
end
@OP.def_rules("`") do
|op, io|
if @lex_state == EXPR_FNAME
Token(op).set_text(op)
else
identify_string(op)
end
end
@OP.def_rules('?') do
|op, io|
if @lex_state == EXPR_END
@lex_state = EXPR_BEG
Token(TkQUESTION).set_text(op)
else
ch = getc
if @lex_state == EXPR_ARG && ch !~ /\s/
ungetc
@lex_state = EXPR_BEG;
Token(TkQUESTION).set_text(op)
else
str = op
str << ch
if (ch == '\\') #'
str << read_escape
end
@lex_state = EXPR_END
Token(TkINTEGER).set_text(str)
end
end
end
@OP.def_rules("&", "&&", "|", "||") do
|op, io|
@lex_state = EXPR_BEG
Token(op).set_text(op)
end
@OP.def_rules("+=", "-=", "*=", "**=",
"&=", "|=", "^=", "<<=", ">>=", "||=", "&&=") do
|op, io|
@lex_state = EXPR_BEG
op =~ /^(.*)=$/
Token(TkOPASGN, $1).set_text(op)
end
@OP.def_rule("+@", proc{@lex_state == EXPR_FNAME}) do |op, io|
Token(TkUPLUS).set_text(op)
end
@OP.def_rule("-@", proc{@lex_state == EXPR_FNAME}) do |op, io|
Token(TkUMINUS).set_text(op)
end
@OP.def_rules("+", "-") do
|op, io|
catch(:RET) do
if @lex_state == EXPR_ARG
if @space_seen and peek(0) =~ /[0-9]/
throw :RET, identify_number(op)
else
@lex_state = EXPR_BEG
end
elsif @lex_state != EXPR_END and peek(0) =~ /[0-9]/
throw :RET, identify_number(op)
else
@lex_state = EXPR_BEG
end
Token(op).set_text(op)
end
end
@OP.def_rule(".") do
@lex_state = EXPR_BEG
if peek(0) =~ /[0-9]/
ungetc
identify_number("")
else
# for obj.if
@lex_state = EXPR_DOT
Token(TkDOT).set_text(".")
end
end
@OP.def_rules("..", "...") do
|op, io|
@lex_state = EXPR_BEG
Token(op).set_text(op)
end
lex_int2
end
# File rdoc/parsers/parse_rb.rb, line 775
def lex_int2
@OP.def_rules("]", "}", ")") do
|op, io|
@lex_state = EXPR_END
@indent -= 1
Token(op).set_text(op)
end
@OP.def_rule(":") do
if @lex_state == EXPR_END || peek(0) =~ /\s/
@lex_state = EXPR_BEG
tk = Token(TkCOLON)
else
@lex_state = EXPR_FNAME;
tk = Token(TkSYMBEG)
end
tk.set_text(":")
end
@OP.def_rule("::") do
# p @lex_state.id2name, @space_seen
if @lex_state == EXPR_BEG or @lex_state == EXPR_ARG && @space_seen
@lex_state = EXPR_BEG
tk = Token(TkCOLON3)
else
@lex_state = EXPR_DOT
tk = Token(TkCOLON2)
end
tk.set_text("::")
end
@OP.def_rule("/") do
|op, io|
if @lex_state == EXPR_BEG || @lex_state == EXPR_MID
identify_string(op)
elsif peek(0) == '='
getc
@lex_state = EXPR_BEG
Token(TkOPASGN, :/).set_text("/=") #")
elsif @lex_state == EXPR_ARG and @space_seen and peek(0) !~ /\s/
identify_string(op)
else
@lex_state = EXPR_BEG
Token("/").set_text(op)
end
end
@OP.def_rules("^") do
@lex_state = EXPR_BEG
Token("^").set_text("^")
end
# @OP.def_rules("^=") do
# @lex_state = EXPR_BEG
# Token(TkOPASGN, :^)
# end
@OP.def_rules(",", ";") do
|op, io|
@lex_state = EXPR_BEG
Token(op).set_text(op)
end
@OP.def_rule("~") do
@lex_state = EXPR_BEG
Token("~").set_text("~")
end
@OP.def_rule("~@", proc{@lex_state = EXPR_FNAME}) do
@lex_state = EXPR_BEG
Token("~").set_text("~@")
end
@OP.def_rule("(") do
@indent += 1
if @lex_state == EXPR_BEG || @lex_state == EXPR_MID
@lex_state = EXPR_BEG
tk = Token(TkfLPAREN)
else
@lex_state = EXPR_BEG
tk = Token(TkLPAREN)
end
tk.set_text("(")
end
@OP.def_rule("[]", proc{@lex_state == EXPR_FNAME}) do
Token("[]").set_text("[]")
end
@OP.def_rule("[]=", proc{@lex_state == EXPR_FNAME}) do
Token("[]=").set_text("[]=")
end
@OP.def_rule("[") do
@indent += 1
if @lex_state == EXPR_FNAME
t = Token(TkfLBRACK)
else
if @lex_state == EXPR_BEG || @lex_state == EXPR_MID
t = Token(TkLBRACK)
elsif @lex_state == EXPR_ARG && @space_seen
t = Token(TkLBRACK)
else
t = Token(TkfLBRACK)
end
@lex_state = EXPR_BEG
end
t.set_text("[")
end
@OP.def_rule("{") do
@indent += 1
if @lex_state != EXPR_END && @lex_state != EXPR_ARG
t = Token(TkLBRACE)
else
t = Token(TkfLBRACE)
end
@lex_state = EXPR_BEG
t.set_text("{")
end
@OP.def_rule('\\') do #'
if getc == "\n"
@space_seen = true
@continue = true
Token(TkSPACE).set_text("\\\n")
else
ungetc
Token("\\").set_text("\\") #"
end
end
@OP.def_rule('%') do
|op, io|
if @lex_state == EXPR_BEG || @lex_state == EXPR_MID
identify_quotation('%')
elsif peek(0) == '='
getc
Token(TkOPASGN, "%").set_text("%=")
elsif @lex_state == EXPR_ARG and @space_seen and peek(0) !~ /\s/
identify_quotation('%')
else
@lex_state = EXPR_BEG
Token("%").set_text("%")
end
end
@OP.def_rule('$') do #'
identify_gvar
end
@OP.def_rule('@') do
if peek(0) =~ /[@\w_]/
ungetc
identify_identifier
else
Token("@").set_text("@")
end
end
# @OP.def_rule("def", proc{|op, io| /\s/ =~ io.peek(0)}) do
# |op, io|
# @indent += 1
# @lex_state = EXPR_FNAME
# # @lex_state = EXPR_END
# # until @rests[0] == "\n" or @rests[0] == ";"
# # rests.shift
# # end
# end
@OP.def_rule("__END__", proc{@prev_char_no == 0 && peek(0) =~ /[\r\n]/}) do
throw :eof
end
@OP.def_rule("") do
|op, io|
printf "MATCH: start %s: %s\n", op, io.inspect if RubyLex.debug?
if peek(0) =~ /[0-9]/
t = identify_number("")
elsif peek(0) =~ /[\w_]/
t = identify_identifier
end
printf "MATCH: end %s: %s\n", op, io.inspect if RubyLex.debug?
t
end
p @OP if RubyLex.debug?
end
io functions
# File rdoc/parsers/parse_rb.rb, line 476
def line_no
@reader.line_num
end
# File rdoc/parsers/parse_rb.rb, line 515
def peek(i = 0)
@reader.peek(i)
end
# File rdoc/parsers/parse_rb.rb, line 511
def peek_equal?(str)
@reader.peek_equal(str)
end
# File rdoc/parsers/parse_rb.rb, line 1292
def read_escape
res = ""
case ch = getc
when /[0-7]/
ungetc ch
3.times do
case ch = getc
when /[0-7]/
when nil
break
else
ungetc
break
end
res << ch
end
when "x"
res << ch
2.times do
case ch = getc
when /[0-9a-fA-F]/
when nil
break
else
ungetc
break
end
res << ch
end
when "M"
res << ch
if (ch = getc) != '-'
ungetc
else
res << ch
if (ch = getc) == "\\" #"
res << ch
res << read_escape
else
res << ch
end
end
when "C", "c" #, "^"
res << ch
if ch == "C" and (ch = getc) != "-"
ungetc
else
res << ch
if (ch = getc) == "\\" #"
res << ch
res << read_escape
else
res << ch
end
end
else
res << ch
end
res
end
# File rdoc/parsers/parse_rb.rb, line 1254
def skip_inner_expression
res = ""
nest = 0
while (ch = getc)
res << ch
if ch == '}'
break if nest.zero?
nest -= 1
elsif ch == '{'
nest += 1
end
end
res
end
# File rdoc/parsers/parse_rb.rb, line 533
def token
set_token_position(line_no, char_no)
begin
begin
tk = @OP.match(self)
@space_seen = tk.kind_of?(TkSPACE)
rescue SyntaxError
abort if @exception_on_syntax_error
tk = TkError.new(line_no, char_no)
end
end while @skip_space and tk.kind_of?(TkSPACE)
if @read_auto_clean_up
get_read
end
# throw :eof unless tk
p tk if $DEBUG
tk
end