Skip to content

Commit

Permalink
Refactor HTML parsing as generic function in openid_utils
Browse files Browse the repository at this point in the history
  • Loading branch information
brendonh committed Sep 18, 2009
1 parent a90947c commit b1443c3
Show file tree
Hide file tree
Showing 2 changed files with 140 additions and 25 deletions.
108 changes: 108 additions & 0 deletions src/openid_utils.erl
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
%%%-------------------------------------------------------------------
%%% File : openid_utils.erl
%%% Author : Brendon Hogger <[email protected]>
%%% Description : Helpers for extracting tags and their attributes from HTML
%%%
%%% Created : 18 Sep 2009 by Brendon Hogger <[email protected]>
%%%-------------------------------------------------------------------
-module(openid_utils).

-export([get_tags/2, get_tags/4]).

-include("openid.hrl").

%% @doc Collect the attributes of every `Tag' element found in `Content'.
%%
%% `Content' is the HTML document as a string and `Tag' is the element
%% name to look for. The result is a list with one `{Name, Value}'
%% attribute list per matching tag, in document order. Scanning stops at
%% the closing head tag or at the end of the input. Tags in which no
%% attribute could be parsed are not reported.
get_tags(Content, Tag) ->
    %% State is {FoundSoFar, Tag, AttrName, AttrVal}; `none' for the last
    %% two fields means "do not filter on an attribute".
    Unfiltered = {[], Tag, none, none},
    find_tags(Content, Unfiltered).

%% @doc Like get_tags/2, but only report tags whose attribute `AttrName'
%% has the value `AttrVal'.
%%
%% Both `AttrName' and `AttrVal' are lowercased here, and the value found
%% in the document is lowercased before being compared, so the filter is
%% case-insensitive.
get_tags(Content, Tag, AttrName, AttrVal) ->
    WantedName = string:to_lower(AttrName),
    WantedVal = string:to_lower(AttrVal),
    find_tags(Content, {[], Tag, WantedName, WantedVal}).

%% Scan forward through the document looking for the next "<".
%%
%% State is {Buffer, Tag, AttrName, AttrVal}, where Buffer holds the
%% attribute lists collected so far in reverse order. The scan ends, and
%% the collected results are returned in document order, at the end of
%% the input or at the closing head tag. The closing head tag is
%% recognised case-insensitively ("</head>", "</HEAD>", ...), in line
%% with the case-insensitive tag-name matching done by read_tag/3, so
%% that markup in the document body is never inspected.
find_tags("", {Buffer,_,_,_}) ->
    lists:reverse(Buffer);
find_tags("<" ++ Rest, {Buffer,Tag,_,_}=State) ->
    case is_head_close(Rest) of
        true  -> lists:reverse(Buffer);
        false -> read_tag(Rest, Tag, State)
    end;
find_tags([_|Rest], State) ->
    find_tags(Rest, State).

%% True when the text following a "<" is "/head>" in any letter case.
is_head_close(AfterBracket) ->
    string:to_lower(lists:sublist(AfterBracket, 6)) =:= "/head>".

%% Called with the input that follows a "<". Decide whether the element
%% that starts here is the one we are looking for.
%%
%% Whitespace directly after the "<" is tolerated and skipped. The
%% element name must then equal Tag (compared case-insensitively) and be
%% followed by whitespace, ">" or "/". On a match the rest of the tag is
%% handed to get_tag_content/2; otherwise the whole tag is skipped.
%%
%% A mismatch used to keep consuming input while still looking for the
%% remaining tag characters, so the name was matched as a subsequence
%% that could span several tags, and "<metadata>" was taken for "meta".
read_tag([C|Rest], Tag, State)
  when C =:= $\s; C =:= $\r; C =:= $\n; C =:= $\t ->
    read_tag(Rest, Tag, State);
read_tag(Rest, Tag, State) ->
    case match_tag_name(Rest, Tag) of
        {match, AfterName} -> get_tag_content(AfterName, State);
        nomatch            -> skip_tag(Rest, State)
    end.

%% Match the element name at the head of the input against Tag.
%% Returns {match, InputAfterName} or nomatch. Input that ends before a
%% delimiter is seen counts as nomatch, so a truncated document simply
%% ends the scan.
match_tag_name([C|_]=AfterName, [])
  when C =:= $\s; C =:= $\r; C =:= $\n; C =:= $\t; C =:= $>; C =:= $/ ->
    {match, AfterName};
match_tag_name([C1|Rest], [C2|TagRest]) ->
    case string:to_lower(C1) =:= string:to_lower(C2) of
        true  -> match_tag_name(Rest, TagRest);
        false -> nomatch
    end;
match_tag_name(_, _) ->
    nomatch.

%% Discard everything up to and including the next ">" and resume the
%% scan after it. If the input ends first, resume with empty input,
%% which makes find_tags/2 return what has been collected.
skip_tag(Input, State) ->
    case lists:dropwhile(fun(Char) -> Char =/= $> end, Input) of
        [$>|AfterTag] -> find_tags(AfterTag, State);
        []            -> find_tags("", State)
    end.


%% Parse the attributes of a tag whose name has already been matched.
%%
%% Rest is the input following the tag name. Everything up to the closing
%% ">" is searched for name="value" / name='value' pairs. Attribute names
%% are lowercased; attribute values are returned exactly as written,
%% because values such as URLs are case-sensitive. If no attribute can be
%% parsed the tag is ignored and scanning continues after it.
get_tag_content(Rest, State) ->
    {Content, Tail} = get_raw_content(Rest, []),
    %% The backslashes are doubled on purpose: inside an Erlang string
    %% "\s" is the escape for a literal space, so "\\s" is what hands the
    %% regex engine a real \s (any whitespace). Group 2 captures the
    %% opening quote and \2 requires the same character to close the
    %% value, so content="it's" is not cut short at the apostrophe.
    case re:run(Content,
                "([a-z0-9-]+)\\s*=\\s*([\"'])(.*?)\\2",
                [{capture, all_but_first, list}, global, caseless]) of
        {match, Attrs} ->
            PropList = [{string:to_lower(K), V} || [K, _Quote, V] <- Attrs],
            check_attrs(PropList, Tail, State);
        _ ->
            find_tags(Tail, State)
    end.

%% Split the input at the first ">".
%%
%% Returns {Content, Tail}: Content is the text before the ">" and Tail
%% is everything after it. The second argument is the accumulator (the
%% characters read so far, in reverse order) and should be [] on the
%% initial call. If the input ends before a ">" is seen (a truncated or
%% malformed document) everything read so far is returned with an empty
%% Tail instead of crashing with function_clause.
get_raw_content(">" ++ Tail, Content) -> {lists:reverse(Content), Tail};
get_raw_content([], Content) -> {lists:reverse(Content), []};
get_raw_content([Char|Rest], Content) -> get_raw_content(Rest, [Char|Content]).

%% Decide whether a parsed tag should be kept, then continue scanning.
%%
%% Attrs is the tag's {Name, Value} list and Tail is the input after the
%% tag. With no attribute filter (`none'/`none' in the state) every tag
%% is kept. Otherwise the tag must carry the wanted attribute; its value
%% is lowercased and passed to check_val/5 for comparison.
check_attrs(Attrs, Tail, {Found, Tag, none, none}) ->
    Kept = [Attrs | Found],
    find_tags(Tail, {Kept, Tag, none, none});
check_attrs(Attrs, Tail, {_, _, WantedName, WantedVal} = State) ->
    case ?GVD(WantedName, Attrs, none) of
        none ->
            %% The tag does not have the attribute at all.
            find_tags(Tail, State);
        Actual ->
            check_val(string:to_lower(Actual), WantedVal, Attrs, Tail, State)
    end.

%% Keep the tag when the attribute value found in it equals the wanted
%% value, otherwise drop it; in both cases continue scanning at Tail.
check_val(Actual, Wanted, Attrs, Tail, {Found, Tag, Key, Val})
  when Actual =:= Wanted ->
    find_tags(Tail, {[Attrs | Found], Tag, Key, Val});
check_val(_Actual, _Wanted, _Attrs, Tail, State) ->
    find_tags(Tail, State).



%% 30> openid_utils:get_tags(S, "meta").
%% [[{"http-equiv","content-type"},
%% {"content","text/html; charset=utf-8"}],
%% [{"name","mssmarttagspreventparsing"},{"content","true"}],
%% [{"name","generator"},{"content","blogger"}]]
%% 31> openid_utils:get_tags(S, "link", "rel", "icon").
%% [[{"rel","icon"},
%% {"type","image/vnd.microsoft.icon"},
%% {"href","http://www.blogger.com/favicon.ico"}]]
%% 32> openid_utils:get_tags(S, "link").
%% [[{"rel","icon"},
%% {"type","image/vnd.microsoft.icon"},
%% {"href","http://www.blogger.com/favicon.ico"}],
%% [{"rel","alternate"},
%% {"type","application/atom+xml"},
%% {"title","brend - atom"},
%% {"href","http://brend.taizilla.com/atom.xml"}],
%% [{"rel","alternate"},
%% {"type","application/rss+xml"},
%% {"title","brend - rss"},
%% {"href","http://brend.taizilla.com/rss.xml"}],
%% [{"rel","service.post"},
%% {"type","application/atom+xml"},
%% {"title","brend - atom"},
%% {"href",
%% "http://www.blogger.com/feeds/1426264525662754834/posts/default"}],
%% [{"rel","edituri"},
%% {"type","application/rsd+xml"},
%% {"title","rsd"},
%% {"href",
%% "http://www.blogger.com/rsd.g?blogid=1426264525662754834"}],
%% [{"rel","stylesheet"},
%% {"type","text/css"},
%% {"href",
%% "http://www.blogger.com/static/v1/v-css/3727950723-blog_controls.css"}],
%% [{"rel","stylesheet"},
%% {"type","text/css"},
%% {"href",
%% "http://www.blogger.com/dyn-css/authorization.css?targetblogid=1426264525662754834&zx=2aeefe4a-f5eb-4cc0-a761-58f936965e98"}]]
57 changes: 32 additions & 25 deletions src/yadis.erl
Original file line number Diff line number Diff line change
Expand Up @@ -101,32 +101,39 @@ get_descriptor_url(Headers, Body) when is_list(Headers) ->
URL -> URL
end.


get_descriptor_url("<meta" ++ Rest) -> get_meta(Rest);
get_descriptor_url("</head>" ++ _Rest) -> none;
get_descriptor_url("") -> none;
get_descriptor_url([_|Rest]) ->
get_descriptor_url(Rest).


get_meta(Rest) ->
Content = get_meta_content(Rest, []),
case re:run(string:to_lower(Content),
"([a-z0-9-]+)\s*=\s*[\"'](.*?)[\"']", % "
[{capture, all_but_first, list}, global]) of
{match, Bits} -> check_meta([{K,V} || [K,V] <- Bits], Rest);
_ -> get_descriptor_url(Rest)
%% Look for a <meta http-equiv="X-XRDS-Location" ...> tag in Body and
%% return the value of its "content" attribute. Returns `none' when no
%% such tag is found; only the first matching tag is consulted.
get_descriptor_url(Body) ->
    XrdsMetas = openid_utils:get_tags(Body, "meta", "http-equiv", "x-xrds-location"),
    case XrdsMetas of
        [First|_] -> ?GVD("content", First, none);
        []        -> none
    end.

check_meta(PropList, Rest) ->
case ?GVD("http-equiv", PropList, none) of
"x-xrds-location" -> ?GVD("content", PropList, none);
_ -> get_descriptor_url(Rest)
end.


get_meta_content(">" ++ _Rest, Content) -> lists:reverse(Content);
get_meta_content([Char|Rest], Bits) -> get_meta_content(Rest, [Char|Bits]).



%% get_descriptor_url("<meta" ++ Rest) -> get_meta(Rest);
%% get_descriptor_url("</head>" ++ _Rest) -> none;
%% get_descriptor_url("") -> none;
%% get_descriptor_url([_|Rest]) ->
%% get_descriptor_url(Rest).


%% get_meta(Rest) ->
%% Content = get_meta_content(Rest, []),
%% case re:run(string:to_lower(Content),
%% "([a-z0-9-]+)\s*=\s*[\"'](.*?)[\"']", % "
%% [{capture, all_but_first, list}, global]) of
%% {match, Bits} -> check_meta([{K,V} || [K,V] <- Bits], Rest);
%% _ -> get_descriptor_url(Rest)
%% end.

%% check_meta(PropList, Rest) ->
%% case ?GVD("http-equiv", PropList, none) of
%% "x-xrds-location" -> ?GVD("content", PropList, none);
%% _ -> get_descriptor_url(Rest)
%% end.


%% get_meta_content(">" ++ _Rest, Content) -> lists:reverse(Content);
%% get_meta_content([Char|Rest], Bits) -> get_meta_content(Rest, [Char|Bits]).


%% ------------------------------------------------------------
Expand Down

0 comments on commit b1443c3

Please sign in to comment.