usr/lib/python2.7/site-packages/pip/_vendor/html5lib/_tokenizer.pyo000064400000151107147205604130021270 0ustar00 abc@`sddlmZmZmZddlmZddlmZddl m Z ddl m Z ddl m Z m Z ddl mZmZmZdd l mZmZdd l mZdd lmZdd lmZee Zd efdYZdS(i(tabsolute_importtdivisiontunicode_literals(tunichr(tdequei(tspaceCharacters(tentities(t asciiLetterstasciiUpper2Lower(tdigitst hexDigitstEOF(t tokenTypest tagTokenTypes(treplacementCharacters(tHTMLInputStream(tTriet HTMLTokenizercB`seZdZdJdZdZdZdJedZdZ dZ dZ dZ d Z d Zd Zd Zd ZdZdZdZdZdZdZdZdZdZdZdZdZdZdZdZ dZ!dZ"dZ#d Z$d!Z%d"Z&d#Z'd$Z(d%Z)d&Z*d'Z+d(Z,d)Z-d*Z.d+Z/d,Z0d-Z1d.Z2d/Z3d0Z4d1Z5d2Z6d3Z7d4Z8d5Z9d6Z:d7Z;d8Z<d9Z=d:Z>d;Z?d<Z@d=ZAd>ZBd?ZCd@ZDdAZEdBZFdCZGdDZHdEZIdFZJdGZKdHZLdIZMRS(Ku  This class takes care of tokenizing HTML. * self.currentToken Holds the token that is currently being processed. * self.state Holds a reference to the method to be invoked... XXX * self.stream Points to HTMLInputStream object. cK`sbt|||_||_t|_g|_|j|_t|_d|_ t t |j dS(N(RtstreamtparsertFalset escapeFlagt lastFourCharst dataStatetstatetescapetNonet currentTokentsuperRt__init__(tselfRRtkwargs((sC/usr/lib/python2.7/site-packages/pip/_vendor/html5lib/_tokenizer.pyR"s      cc`s}tg|_xg|jrxx6|jjrVitdd6|jjjdd6Vq!Wx|jrt|jjVqZWqWdS(u This is where the magic happens. We do our usually processing through the states and when we have a token to return we yield the token which pauses processing until the next token is requested. u ParseErrorutypeiudataN(Rt tokenQueueRRterrorsR tpoptpopleft(R((sC/usr/lib/python2.7/site-packages/pip/_vendor/html5lib/_tokenizer.pyt__iter__1s * c %C`st}d}|r!t}d}ng}|jj}x8||krp|tk rp|j||jj}q9Wtdj||}|tkrt|}|j jit dd6dd6i|d6d 6nd |kod kns|d kr3d }|j jit dd6dd6i|d6d 6nrd|koJdknsd|kofdknsd|kodknsd|kodkns|t ddddddddddd d!d"d#d$d%d&d'd(d)d*d+d,d-d.d/d0d1d2d3d4d5d6d7d g#krQ|j jit dd6dd6i|d6d 6nyt |}WnAt k r|d8}t d |d?Bt d9|d:@B}nX|d;kr|j jit dd6d<d6|jj|n|S(=uThis function returns either U+FFFD or the character based on the decimal or hexadecimal representation. It also discards ";" if present. If not present self.tokenQueue.append({"type": tokenTypes["ParseError"]}) is invoked. i iuu ParseErrorutypeu$illegal-codepoint-for-numeric-entityudatau charAsIntudatavarsiiiu�iiiiiiiii iiiiiiiiiiiiiiiiiii i i i i i i i i i iiiiiiiiu;u numeric-entity-without-semicolon(R R RtcharR tappendtinttjoinRR R t frozensettchrt ValueErrortunget( RtisHextallowedtradixt charStacktct charAsIntR%tv((sC/usr/lib/python2.7/site-packages/pip/_vendor/html5lib/_tokenizer.pytconsumeNumberEntityAs`              *  c C`sd}|jjg}|dtks]|dtddfks]|dk rt||dkrt|jj|dn|ddkrpt}|j|jj|ddkrt}|j|jjn|r|dt ks| r"|dt kr"|jj|d|j |}q7|j jit dd 6d d 6|jj|jdd j|}nxF|dtk rtjd j|sPn|j|jjqsWy,tjd j|d }t|}Wntk rd}nX|dk r|dd kr@|j jit dd 6dd 6n|dd kr|r||tks||t ks||dkr|jj|jdd j|}q7t|}|jj|j|d j||7}nK|j jit dd 6dd 6|jj|jdd j|}|r[|jd ddc|7u ParseErroru'expected-tag-name-but-got-right-bracketu Charactersu<>u?u'expected-tag-name-but-got-question-markuexpected-tag-nameu<(RR%tmarkupDeclarationOpenStateRtcloseTagOpenStateRR RRt tagNameStateR R&RR,tbogusCommentStateR5(RRJ((sC/usr/lib/python2.7/site-packages/pip/_vendor/html5lib/_tokenizer.pyRHis6      "   " cC`s?|jj}|tkrSitdd6|d6gd6td6|_|j|_n|dkr|jj itdd6dd6|j |_n|t kr|jj itdd6d d6|jj itd d6d d6|j |_nL|jj itdd6d d6i|d6d 6|jj ||j |_tS(NuEndTagutypeunameudatau selfClosingu>u ParseErroru*expected-closing-tag-but-got-right-bracketu expected-closing-tag-but-got-eofu Charactersuu ParseErrorutypeueof-in-tag-nameudatau/uuinvalid-codepointunameu�(RR%RtbeforeAttributeNameStateRRFR R R&R RtselfClosingStartTagStateRR5(RRJ((sC/usr/lib/python2.7/site-packages/pip/_vendor/html5lib/_tokenizer.pyRVs"        cC`su|jj}|dkr3d|_|j|_n>|jjitdd6dd6|jj||j |_t S(Nu/uu Charactersutypeu|jjitdd6dd6|jj ||j |_t S(Nu Charactersutypeuu Charactersu|jjitdd6dd6|jj||j |_t S(Nu/uu Charactersutypeu|jjitdd6dd6|jj ||j |_t S(Nu Charactersutypeuu Charactersu|jjitdd6dd6|jj ||j |_t S( Nu/uu!u Charactersutypeu|jjitdd6dd6|jj ||j |_t S(Nu Charactersutypeuu Charactersuuu ParseErroruinvalid-codepointu�( RR%R R&R RgRRRRhR RR5(RRJ((sC/usr/lib/python2.7/site-packages/pip/_vendor/html5lib/_tokenizer.pyRes& %  "    " cC`s|jj}|dkr3d|_|j|_n|tkr}|jjitdd6d|d6||_|j |_n>|jjitdd6dd6|jj ||j |_t S(Nu/uu Charactersutypeu|jjitdd6dd6|jj ||j |_t S(Nu Charactersutypeuu Charactersuu Charactersutypeudatauscript(u/u>(RR%RR)R R&R RZR]tscriptDataDoubleEscapedStateRRhRR,R5(RRJ((sC/usr/lib/python2.7/site-packages/pip/_vendor/html5lib/_tokenizer.pyRjs" " cC`s?|jj}|dkrL|jjitdd6dd6|j|_n|dkr|jjitdd6dd6|j|_n|dkr|jjitdd6dd6|jjitdd6d d6n_|tkr|jjitdd6d d6|j |_n"|jjitdd6|d6t S( Nu-u Charactersutypeudatauuu ParseErroruinvalid-codepointu�ueof-in-script-in-script( RR%R R&R RnRRRRlR RR5(RRJ((sC/usr/lib/python2.7/site-packages/pip/_vendor/html5lib/_tokenizer.pyRos, % " "     " cC`su|jj}|dkrU|jjitdd6dd6d|_|j|_n|jj||j |_t S(Nu/u Charactersutypeudatau( RR%R R&R RZtscriptDataDoubleEscapeEndStateRR,RlR5(RRJ((sC/usr/lib/python2.7/site-packages/pip/_vendor/html5lib/_tokenizer.pyRn0s "  cC`s|jj}|ttdBkrz|jjitdd6|d6|jjdkrk|j |_ q|j |_ n\|t kr|jjitdd6|d6|j|7_n|jj ||j |_ tS(Nu/u>u Charactersutypeudatauscript(u/u>(RR%RR)R R&R RZR]RhRRlRR,R5(RRJ((sC/usr/lib/python2.7/site-packages/pip/_vendor/html5lib/_tokenizer.pyRp;s" " cC`s|jj}|tkr1|jjttnz|tkrf|jdj|dg|j|_ nE|dkr|j n,|dkr|j |_ n|dkr|j jit d d 6d d6|jdj|dg|j|_ n|d krH|j jit d d 6d d6|jdjddg|j|_ nc|tkr|j jit d d 6dd6|j|_ n&|jdj|dg|j|_ tS(Nudatauu>u/u'u"u=uu/uu ParseErrorutypeuinvalid-codepointu�u'u"uudatauu/uu ParseErrorutypeuinvalid-codepointu�u'u"uu ParseErrorutypeu.expected-attribute-value-but-got-right-bracketudatauuinvalid-codepointiiu�u=u               cC`s|jj}|dkr*|j|_n|dkrF|jdn|dkr|jjitdd6dd6|jddd cd 7u"u'u=uu"u'u=u|jj it dd6dd6|jj ||j|_t S(Nu>u/u ParseErrorutypeu$unexpected-EOF-after-attribute-valueudatau*unexpected-character-after-attribute-value(RR%RRXRRFRYR R R&R R,RR5(RRJ((sC/usr/lib/python2.7/site-packages/pip/_vendor/html5lib/_tokenizer.pyR{ s"        cC`s|jj}|dkr5t|jd<|jn|tkr|jjitdd6dd6|jj ||j |_ n>|jjitdd6dd6|jj ||j |_ tS(Nu>u selfClosingu ParseErrorutypeu#unexpected-EOF-after-solidus-in-tagudatau)unexpected-character-after-solidus-in-tag( RR%R5RRFR R R&R R,RRRX(RRJ((sC/usr/lib/python2.7/site-packages/pip/_vendor/html5lib/_tokenizer.pyRY4s       cC`sc|jjd}|jdd}|jjitdd6|d6|jj|j|_t S(Nu>uu�uCommentutypeudata( RRItreplaceR R&R R%RRR5(RRJ((sC/usr/lib/python2.7/site-packages/pip/_vendor/html5lib/_tokenizer.pyRWFs   cC`sB|jjg}|ddkrv|j|jj|ddkritdd6dd6|_|j|_tSnw|ddkr(t}xPdd d!d"d#d$fD]6}|j|jj|d|krt}PqqW|ritdd6dd6dd6dd6td6|_|j |_tSn|ddkr|j dk r|j j j r|j j j dj|j j jkrt}xPd dddddgD]6}|j|jj|d|krt}PqqW|r|j|_tSn|jjitdd6dd6x |r1|jj|jqW|j|_tS(%Niu-uCommentutypeuudatauduDuouOucuCutuTuyuYupuPueuEuDoctypeunameupublicIdusystemIducorrectu[uAu ParseErroruexpected-dashes-or-doctype(uduD(uouO(ucuC(utuT(uyuY(upuP(ueuE(RR%R&R RtcommentStartStateRR5RRt doctypeStateRttreet openElementst namespacetdefaultNamespacetcdataSectionStateR R,R"RW(RR0tmatchedtexpected((sC/usr/lib/python2.7/site-packages/pip/_vendor/html5lib/_tokenizer.pyRTUsR    %    cC`s1|jj}|dkr*|j|_n|dkrn|jjitdd6dd6|jdcd7uincorrect-commentueof-in-comment( RR%tcommentStartDashStateRR R&R RRR t commentStateR5(RRJ((sC/usr/lib/python2.7/site-packages/pip/_vendor/html5lib/_tokenizer.pyR}s(        cC`s5|jj}|dkr*|j|_n|dkrn|jjitdd6dd6|jdcd7uincorrect-commentueof-in-comment( RR%tcommentEndStateRR R&R RRR RR5(RRJ((sC/usr/lib/python2.7/site-packages/pip/_vendor/html5lib/_tokenizer.pyRs(        cC`s|jj}|dkr*|j|_n|dkrn|jjitdd6dd6|jdcd7uu ParseErrorutypeuinvalid-codepointudatau--�u!u,unexpected-bang-after-double-dash-in-commentu-u,unexpected-dash-after-double-dash-in-commentueof-in-comment-double-dashuunexpected-char-in-commentu--( RR%R R&RRRR RtcommentEndBangStateR R5(RRJ((sC/usr/lib/python2.7/site-packages/pip/_vendor/html5lib/_tokenizer.pyRs6           cC`s2|jj}|dkr=|jj|j|j|_n|dkrk|jdcd7<|j|_n|dkr|jjitdd6dd6|jdcd 7<|j |_ns|t kr |jjitdd6d d6|jj|j|j|_n#|jdcd|7<|j |_t S( Nu>u-udatau--!uu ParseErrorutypeuinvalid-codepointu--!�ueof-in-comment-end-bang-state( RR%R R&RRRRR RR R5(RRJ((sC/usr/lib/python2.7/site-packages/pip/_vendor/html5lib/_tokenizer.pyRs(       cC`s|jj}|tkr*|j|_n|tkr|jjitdd6dd6t |j d<|jj|j |j |_n>|jjitdd6dd6|jj ||j|_t S(Nu ParseErrorutypeu!expected-doctype-name-but-got-eofudataucorrectuneed-space-after-doctype(RR%RtbeforeDoctypeNameStateRR R R&R RRRR,R5(RRJ((sC/usr/lib/python2.7/site-packages/pip/_vendor/html5lib/_tokenizer.pyR~ s      cC`s?|jj}|tkrn|dkr{|jjitdd6dd6t|jd<|jj|j|j|_ n|dkr|jjitdd6dd6d |jd <|j |_ nv|t kr"|jjitdd6d d6t|jd<|jj|j|j|_ n||jd <|j |_ t S( Nu>u ParseErrorutypeu+expected-doctype-name-but-got-right-bracketudataucorrectuuinvalid-codepointu�unameu!expected-doctype-name-but-got-eof( RR%RR R&R RRRRtdoctypeNameStateR R5(RRJ((sC/usr/lib/python2.7/site-packages/pip/_vendor/html5lib/_tokenizer.pyRs.            cC`ss|jj}|tkrG|jdjt|jd<|j|_n(|dkr|jdjt|jd<|jj |j|j |_n|dkr|jj it dd6dd6|jdcd7<|j |_n|t kr\|jj it dd6d d6t|jd <|jdjt|jd<|jj |j|j |_n|jdc|7uu ParseErrorutypeuinvalid-codepointudatau�ueof-in-doctype-nameucorrect(RR%RRRDRtafterDoctypeNameStateRR R&RR RR RR5(RRJ((sC/usr/lib/python2.7/site-packages/pip/_vendor/html5lib/_tokenizer.pyR6s,       cC`s|jj}|tkrn|dkrL|jj|j|j|_n|tkrt |jd<|jj ||jjit dd6dd6|jj|j|j|_n9|dkr)t }xBd d!d"d#d$fD]+}|jj}||krt }PqqW|r|j |_t Snp|d%krt }xBd&d'd(d)d*fD]+}|jj}||krQt }PqQqQW|r|j|_t Sn|jj ||jjit dd6dd6i|d6d6t |jd<|j|_t S(+Nu>ucorrectu ParseErrorutypeueof-in-doctypeudataupuPuuuUubuBuluLuiuIucuCusuSuyuYutuTueuEumuMu*expected-space-or-right-bracket-in-doctypeudatavars(upuP(uuuU(ubuB(uluL(uiuI(ucuC(usuS(uyuY(usuS(utuT(ueuE(umuM(RR%RR R&RRRR RR,R R5tafterDoctypePublicKeywordStatetafterDoctypeSystemKeywordStatetbogusDoctypeState(RRJRR((sC/usr/lib/python2.7/site-packages/pip/_vendor/html5lib/_tokenizer.pyROsT               cC`s|jj}|tkr*|j|_n|d krw|jjitdd6dd6|jj||j|_ny|t kr|jjitdd6dd6t |j d<|jj|j |j |_n|jj||j|_t S( Nu'u"u ParseErrorutypeuunexpected-char-in-doctypeudataueof-in-doctypeucorrect(u'u"(RR%Rt"beforeDoctypePublicIdentifierStateRR R&R R,R RRRR5(RRJ((sC/usr/lib/python2.7/site-packages/pip/_vendor/html5lib/_tokenizer.pyRs"       cC`sg|jj}|tkrnE|dkrFd|jd<|j|_n|dkrnd|jd<|j|_n|dkr|jjit dd6dd 6t |jd <|jj|j|j |_n|t kr(|jjit dd6d d 6t |jd <|jj|j|j |_n;|jjit dd6d d 6t |jd <|j |_tS( Nu"uupublicIdu'u>u ParseErrorutypeuunexpected-end-of-doctypeudataucorrectueof-in-doctypeuunexpected-char-in-doctype(RR%RRt(doctypePublicIdentifierDoubleQuotedStateRt(doctypePublicIdentifierSingleQuotedStateR R&R RRR RR5(RRJ((sC/usr/lib/python2.7/site-packages/pip/_vendor/html5lib/_tokenizer.pyRs4              cC`s?|jj}|dkr*|j|_n|dkrn|jjitdd6dd6|jdcd7uunexpected-end-of-doctypeucorrectueof-in-doctype( RR%t!afterDoctypePublicIdentifierStateRR R&R RRRR R5(RRJ((sC/usr/lib/python2.7/site-packages/pip/_vendor/html5lib/_tokenizer.pyRs*         cC`s?|jj}|dkr*|j|_n|dkrn|jjitdd6dd6|jdcd7uunexpected-end-of-doctypeucorrectueof-in-doctype( RR%RRR R&R RRRR R5(RRJ((sC/usr/lib/python2.7/site-packages/pip/_vendor/html5lib/_tokenizer.pyRs*         cC`s|jj}|tkr*|j|_nZ|dkrX|jj|j|j|_n,|dkr|jjit dd6dd6d|jd<|j |_n|d kr|jjit dd6dd6d|jd<|j |_n|t krI|jjit dd6d d6t |jd <|jj|j|j|_n;|jjit dd6dd6t |jd <|j|_tS( Nu>u"u ParseErrorutypeuunexpected-char-in-doctypeudatauusystemIdu'ueof-in-doctypeucorrect(RR%Rt-betweenDoctypePublicAndSystemIdentifiersStateRR R&RRR t(doctypeSystemIdentifierDoubleQuotedStatet(doctypeSystemIdentifierSingleQuotedStateR RRR5(RRJ((sC/usr/lib/python2.7/site-packages/pip/_vendor/html5lib/_tokenizer.pyRs6              cC`s8|jj}|tkrn|dkrL|jj|j|j|_n|dkrtd|jd<|j|_n|dkrd|jd<|j |_n|t kr|jjit dd6dd 6t |jd <|jj|j|j|_n;|jjit dd6d d 6t |jd <|j |_tS( Nu>u"uusystemIdu'u ParseErrorutypeueof-in-doctypeudataucorrectuunexpected-char-in-doctype(RR%RR R&RRRRRR R RRR5(RRJ((sC/usr/lib/python2.7/site-packages/pip/_vendor/html5lib/_tokenizer.pyRs.            cC`s|jj}|tkr*|j|_n|d krw|jjitdd6dd6|jj||j|_ny|t kr|jjitdd6dd6t |j d<|jj|j |j |_n|jj||j|_t S( Nu'u"u ParseErrorutypeuunexpected-char-in-doctypeudataueof-in-doctypeucorrect(u'u"(RR%Rt"beforeDoctypeSystemIdentifierStateRR R&R R,R RRRR5(RRJ((sC/usr/lib/python2.7/site-packages/pip/_vendor/html5lib/_tokenizer.pyRs"       cC`sg|jj}|tkrnE|dkrFd|jd<|j|_n|dkrnd|jd<|j|_n|dkr|jjit dd6dd 6t |jd <|jj|j|j |_n|t kr(|jjit dd6d d 6t |jd <|jj|j|j |_n;|jjit dd6dd 6t |jd <|j |_tS( Nu"uusystemIdu'u>u ParseErrorutypeuunexpected-char-in-doctypeudataucorrectueof-in-doctype(RR%RRRRRR R&R RRR RR5(RRJ((sC/usr/lib/python2.7/site-packages/pip/_vendor/html5lib/_tokenizer.pyR/s4              cC`s?|jj}|dkr*|j|_n|dkrn|jjitdd6dd6|jdcd7uunexpected-end-of-doctypeucorrectueof-in-doctype( RR%t!afterDoctypeSystemIdentifierStateRR R&R RRRR R5(RRJ((sC/usr/lib/python2.7/site-packages/pip/_vendor/html5lib/_tokenizer.pyRLs*         cC`s?|jj}|dkr*|j|_n|dkrn|jjitdd6dd6|jdcd7uunexpected-end-of-doctypeucorrectueof-in-doctype( RR%RRR R&R RRRR R5(RRJ((sC/usr/lib/python2.7/site-packages/pip/_vendor/html5lib/_tokenizer.pyRds*         cC`s|jj}|tkrn|dkrL|jj|j|j|_n|tkr|jjit dd6dd6t |jd<|jj|j|j|_n.|jjit dd6dd6|j |_t S(Nu>u ParseErrorutypeueof-in-doctypeudataucorrectuunexpected-char-in-doctype( RR%RR R&RRRR R RRR5(RRJ((sC/usr/lib/python2.7/site-packages/pip/_vendor/html5lib/_tokenizer.pyR|s        cC`s|jj}|dkr=|jj|j|j|_n>|tkr{|jj||jj|j|j|_nt S(Nu>( RR%R R&RRRR R,R5(RRJ((sC/usr/lib/python2.7/site-packages/pip/_vendor/html5lib/_tokenizer.pyRs  cC`sNg}xtr|j|jjd|j|jjd|jj}|tkr`Pq |dddkr|dd |d|j jit dd 6|d 6n|j |_ tS(Nu]u>iiu]]uuiu ParseErrorutypeuinvalid-codepointudatau�u Characters(R5R&RRIR%R R(tcounttrangeR R R|RR(RRJR%t nullCountRw((sC/usr/lib/python2.7/site-packages/pip/_vendor/html5lib/_tokenizer.pyRs.    N(Nt__name__t __module__t__doc__RRR$R4RRBRCRFRRGRNRLRPRRRSRHRURVRMR[R\ROR_R`RQRaRcRbRdRhRfReRgRiRkRjRlRmRoRnRpRXRqRsRrRxRzRyR{RYRWRTR}RRRRRR~RRRRRRRRRRRRRRRR(((sC/usr/lib/python2.7/site-packages/pip/_vendor/html5lib/_tokenizer.pyRs    HP          #                  6 "       -          3            N(t __future__RRRtpip._vendor.sixRR*t collectionsRt constantsRRRRR R R R R Rt _inputstreamRt_trieRR6tobjectR(((sC/usr/lib/python2.7/site-packages/pip/_vendor/html5lib/_tokenizer.pyts