lexer.go

package httphead

import (
	"bytes"
)

// ItemType encodes the type of the lexing token.
type ItemType int

const (
	// ItemUndef reports that token is undefined.
	ItemUndef ItemType = iota
	// ItemToken reports that token is an RFC2616 token.
	ItemToken
	// ItemSeparator reports that token is an RFC2616 separator.
	ItemSeparator
	// ItemString reports that token is an RFC2616 quoted string.
	ItemString
	// ItemComment reports that token is an RFC2616 comment.
	ItemComment
	// ItemOctet reports that token is an octet slice.
	ItemOctet
)

// Scanner represents a header tokens scanner.
// See https://tools.ietf.org/html/rfc2616#section-2
type Scanner struct {
	data []byte
	pos  int

	itemType  ItemType
	itemBytes []byte

	err bool
}

// NewScanner creates a new RFC2616 data scanner.
func NewScanner(data []byte) *Scanner {
	return &Scanner{data: data}
}

// Next scans for the next token. It returns true on successful scanning, and
// false on error or EOF.
func (l *Scanner) Next() bool {
	c, ok := l.nextChar()
	if !ok {
		return false
	}
	switch c {
	case '"': // quoted-string;
		return l.fetchQuotedString()
	case '(': // comment;
		return l.fetchComment()
	case '\\', ')': // unexpected chars;
		l.err = true
		return false
	default:
		return l.fetchToken()
	}
}
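
// A minimal usage sketch (illustrative input, error handling elided): walk the
// tokens of a header value with Next and inspect each one by type.
//
//	s := NewScanner([]byte(`foo; bar="baz qux"`))
//	for s.Next() {
//		switch s.Type() {
//		case ItemToken, ItemSeparator:
//			_ = s.Bytes() // "foo", ";", "bar", "=", ...
//		case ItemString:
//			_ = s.Bytes() // baz qux, with quotes and escapes removed
//		}
//	}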

// FetchUntil fetches ItemOctet from the current scanner position to the first
// occurrence of c or to the end of the underlying data.
func (l *Scanner) FetchUntil(c byte) bool {
	l.resetItem()
	if l.pos == len(l.data) {
		return false
	}
	return l.fetchOctet(c)
}
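
// A sketch (illustrative) of combining FetchUntil and Skip on an existing
// Scanner s: read the raw octets up to a delimiter, then consume the delimiter.
//
//	if s.FetchUntil(';') {
//		_ = s.Bytes() // ItemOctet bytes before the ';'
//		s.Skip(';')   // advance past the ';' itself
//	}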

// Peek reads the byte at the current position without advancing it. On end of
// data it returns 0.
func (l *Scanner) Peek() byte {
	if l.pos == len(l.data) {
		return 0
	}
	return l.data[l.pos]
}

// Peek2 reads the first two bytes at the current position without advancing it.
// If there is not enough data, the missing bytes are returned as 0.
func (l *Scanner) Peek2() (a, b byte) {
	if l.pos == len(l.data) {
		return 0, 0
	}
	if l.pos+1 == len(l.data) {
		return l.data[l.pos], 0
	}
	return l.data[l.pos], l.data[l.pos+1]
}

// Buffered reports how many bytes are left to scan.
func (l *Scanner) Buffered() int {
	return len(l.data) - l.pos
}

// Advance moves the current position index by n bytes. It returns true on
// successful move.
func (l *Scanner) Advance(n int) bool {
	l.pos += n
	if l.pos > len(l.data) {
		l.pos = len(l.data)
		return false
	}
	return true
}

// Skip skips all bytes until the first occurrence of c.
func (l *Scanner) Skip(c byte) {
	if l.err {
		return
	}
	// Reset scanner state.
	l.resetItem()
	if i := bytes.IndexByte(l.data[l.pos:], c); i == -1 {
		// Reached the end of data.
		l.pos = len(l.data)
	} else {
		l.pos += i + 1
	}
}

// SkipEscaped skips all bytes until the first occurrence of a non-escaped c.
func (l *Scanner) SkipEscaped(c byte) {
	if l.err {
		return
	}
	// Reset scanner state.
	l.resetItem()
	if i := ScanUntil(l.data[l.pos:], c); i == -1 {
		// Reached the end of data.
		l.pos = len(l.data)
	} else {
		l.pos += i + 1
	}
}

// Type reports the current token type.
func (l *Scanner) Type() ItemType {
	return l.itemType
}

// Bytes returns the current token bytes.
func (l *Scanner) Bytes() []byte {
	return l.itemBytes
}

func (l *Scanner) nextChar() (byte, bool) {
	// Reset scanner state.
	l.resetItem()
	if l.err {
		return 0, false
	}
	l.pos += SkipSpace(l.data[l.pos:])
	if l.pos == len(l.data) {
		return 0, false
	}
	return l.data[l.pos], true
}

func (l *Scanner) resetItem() {
	l.itemType = ItemUndef
	l.itemBytes = nil
}

func (l *Scanner) fetchOctet(c byte) bool {
	i := l.pos
	if j := bytes.IndexByte(l.data[l.pos:], c); j == -1 {
		// Reached the end of data.
		l.pos = len(l.data)
	} else {
		l.pos += j
	}
	l.itemType = ItemOctet
	l.itemBytes = l.data[i:l.pos]
	return true
}

func (l *Scanner) fetchToken() bool {
	n, t := ScanToken(l.data[l.pos:])
	if n == -1 {
		l.err = true
		return false
	}
	l.itemType = t
	l.itemBytes = l.data[l.pos : l.pos+n]
	l.pos += n
	return true
}

func (l *Scanner) fetchQuotedString() (ok bool) {
	l.pos++
	n := ScanUntil(l.data[l.pos:], '"')
	if n == -1 {
		l.err = true
		return false
	}
	l.itemType = ItemString
	l.itemBytes = RemoveByte(l.data[l.pos:l.pos+n], '\\')
	l.pos += n + 1
	return true
}

func (l *Scanner) fetchComment() (ok bool) {
	l.pos++
	n := ScanPairGreedy(l.data[l.pos:], '(', ')')
	if n == -1 {
		l.err = true
		return false
	}
	l.itemType = ItemComment
	l.itemBytes = RemoveByte(l.data[l.pos:l.pos+n], '\\')
	l.pos += n + 1
	return true
}

// ScanUntil scans for the first non-escaped character c in the given data.
// It returns the index of the matched c, or -1 if c is not found.
func ScanUntil(data []byte, c byte) (n int) {
	for {
		i := bytes.IndexByte(data[n:], c)
		if i == -1 {
			return -1
		}
		n += i
		if n == 0 || data[n-1] != '\\' {
			break
		}
		n++
	}
	return
}
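
// For example (illustrative), the escaped quote is skipped and the index of
// the first non-escaped quote is returned:
//
//	ScanUntil([]byte(`foo\"bar"baz`), '"') // == 8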

// ScanPairGreedy scans for a complete pair of opening and closing chars in a
// greedy manner. Note that the first opening byte must not be present in data.
func ScanPairGreedy(data []byte, open, close byte) (n int) {
	var m int
	opened := 1
	for {
		i := bytes.IndexByte(data[n:], close)
		if i == -1 {
			return -1
		}
		n += i
		// If the found index is not escaped then it is the end.
		if n == 0 || data[n-1] != '\\' {
			opened--
		}

		// Count the opening bytes seen up to the current position to
		// keep track of nesting.
		for m < n {
			j := bytes.IndexByte(data[m:n], open)
			if j == -1 {
				break
			}
			m += j + 1
			opened++
		}

		if opened == 0 {
			break
		}

		n++
		m = n
	}
	return
}
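
// For example (illustrative), with the leading '(' already consumed by the
// caller, the index of the close matching the outer pair is returned:
//
//	ScanPairGreedy([]byte("a(b)c)d"), '(', ')') // == 5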

// RemoveByte returns data without c. If c is not present in data it returns
// the same slice. Otherwise, it copies data without c.
func RemoveByte(data []byte, c byte) []byte {
	j := bytes.IndexByte(data, c)
	if j == -1 {
		return data
	}

	// Since at least one c is present, the result can be at most
	// len(data)-1 bytes long.
	n := len(data) - 1
	result := make([]byte, n)

	k := copy(result, data[:j])

	// Copy the chunks between occurrences of c, including the tail after
	// the last occurrence.
	for i := j + 1; i < len(data); {
		j = bytes.IndexByte(data[i:], c)
		if j != -1 {
			k += copy(result[k:], data[i:i+j])
			i = i + j + 1
		} else {
			k += copy(result[k:], data[i:])
			break
		}
	}

	return result[:k]
}
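
// For example (illustrative), dropping the escape characters from
// quoted-string contents:
//
//	RemoveByte([]byte(`say \"hi\"`), '\\') // == []byte(`say "hi"`)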

// SkipSpace skips spaces and lws-sequences from p.
// It returns the number of bytes skipped.
func SkipSpace(p []byte) (n int) {
	for len(p) > 0 {
		switch {
		case len(p) >= 3 &&
			p[0] == '\r' &&
			p[1] == '\n' &&
			OctetTypes[p[2]].IsSpace():
			p = p[3:]
			n += 3
		case OctetTypes[p[0]].IsSpace():
			p = p[1:]
			n++
		default:
			return
		}
	}
	return
}
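
// For example (illustrative), a folded header line (CRLF followed by a space)
// counts as linear whitespace:
//
//	SkipSpace([]byte("\r\n  value")) // == 4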

// ScanToken scans for the next token in p. It returns the length of the token
// and its type. It does not trim p.
func ScanToken(p []byte) (n int, t ItemType) {
	if len(p) == 0 {
		return 0, ItemUndef
	}
	c := p[0]
	switch {
	case OctetTypes[c].IsSeparator():
		return 1, ItemSeparator
	case OctetTypes[c].IsToken():
		for n = 1; n < len(p); n++ {
			c := p[n]
			if !OctetTypes[c].IsToken() {
				break
			}
		}
		return n, ItemToken
	default:
		return -1, ItemUndef
	}
}
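
// For example (illustrative), assuming the package's OctetTypes table
// classifies ';' as an RFC2616 separator:
//
//	ScanToken([]byte("foo;bar")) // == (3, ItemToken): "foo" up to the ';'
//	ScanToken([]byte(";bar"))    // == (1, ItemSeparator)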