| Class | String |
| In: |
lib/feedparser/textconverters.rb
lib/feedparser/text-output.rb |
| Parent: | Object |
This class provides various converters
| MY_ENTITIES | = | {} |
# File lib/feedparser/textconverters.rb, line 17
# Escape the HTML special characters of the text.
#
# Replaces '&', '<' and '>' with the entities '&amp;', '&lt;' and '&gt;'.
# The ampersand is handled first so that the ampersands introduced by the
# two other replacements are not escaped a second time.
#
# Returns a new String; the receiver is left untouched.
def escape_html
  gsub('&', '&amp;').gsub('<', '&lt;').gsub('>', '&gt;')
end
Returns a true value if the text contains escaped HTML (i.e. HTML tags written with HTML entities). Used by String#text2html.
# File lib/feedparser/textconverters.rb, line 13
# Tell whether the text contains entity-escaped HTML markup.
#
# Looks for a few common tags written with entities: an escaped
# <img src=, <a href=, <br> (also "<br/>" and "<br />") or <p>.
# Literal (unescaped) tags are deliberately NOT matched here.
#
# Returns the index of the first successful match (a true value), or nil
# when none of the patterns matches.
def escaped_html?
  (self =~ /&lt;img src=/) ||
    (self =~ /&lt;a href=/) ||
    (self =~ /&lt;br(\/| \/|)&gt;/) ||
    (self =~ /&lt;p&gt;/)
end
Convert an HTML text to plain text
# File lib/feedparser/text-output.rb, line 7
# Convert an HTML text to plain text.
#
# A copy of the receiver is fed to a FeedParser::HTML2TextParser and the
# data saved by the parser is then tidied up in place:
# * leading and trailing whitespace is removed,
# * spaces directly before or after a newline are removed,
# * runs of two or more newlines are squeezed to exactly two.
#
# Returns the cleaned-up text produced by the parser.
def html2text
  converter = FeedParser::HTML2TextParser::new(true)
  converter.feed(self.clone)
  converter.close
  plain = converter.savedata

  cleanups = [
    [/\A\s*/m, ''],        # leading whitespace
    [/\s*\Z/m, ''],        # trailing whitespace
    [/ *\n/m, "\n"],       # spaces before a newline
    [/\n */m, "\n"],       # spaces after a newline
    [/\n\n+/m, "\n\n"]     # duplicated newlines
  ]
  cleanups.each { |pattern, replacement| plain.gsub!(pattern, replacement) }
  plain
end
Is this text HTML? Searches for a few common tags. Used by String#text2html.
# File lib/feedparser/textconverters.rb, line 8
# Tell whether the text looks like HTML by searching for a few tags.
#
# The patterns are tried in the order listed; the result is the index of
# the match for the first pattern that matches (a true value), or nil when
# none of them matches.
def html?
  tag_patterns = [/<p>/, /<\/p>/, /<br>/, /<br\s*(\/)?\s*>/, /<\/a>/, /<img.*>/]
  tag_patterns.each do |tag|
    position = (self =~ tag)
    return position if position
  end
  nil
end
Remove white space around the text
# File lib/feedparser/textconverters.rb, line 83
# Remove the white space around the text, in place.
#
# Both patterns can match the empty string, so each gsub! always performs
# a substitution and returns the receiver (never nil).
#
# Returns the receiver itself.
def rmWhiteSpace!
  gsub!(/\A\s*/m, '')
  gsub!(/\s*\Z/m,'')
end
convert text to HTML
# File lib/feedparser/textconverters.rb, line 39
# Convert the text to HTML.
#
# Works on a copy of the receiver:
# * text for which html? is true is kept as is;
# * text for which escaped_html? is true is passed through unescape_html;
# * anything else is treated as plain text: it is wrapped in <p>...</p>,
#   blank lines become paragraph breaks, and http/ftp/https URIs are turned
#   into links.
#
# When +feed+ and feed.link are both set, every src= / href= attribute whose
# value is not already absolute is rewritten relative to feed.link.
#
# feed:: object responding to +link+, or nil.
#
# Returns the resulting HTML String.
def text2html(feed)
  html = self.clone
  unless html.html?
    if html.escaped_html?
      html = html.unescape_html
    else
      # paragraphs
      html.gsub!(/\A\s*(.*)\Z/m, '<p>\1</p>')
      html.gsub!(/\s*\n(\s*\n)+\s*/, "</p>\n<p>")
      # uris
      html.gsub!(/(#{URI::regexp(['http','ftp','https'])})/,
                 '<a href="\1">\1</a>')
    end
  end

  # Without a feed link there is nothing to resolve relative URLs against.
  return html unless feed and feed.link

  # Handle broken hrefs in <a> and <img>
  html.gsub!(/(\s(src|href)=['"])([^'"]*)(['"])/) do |attribute|
    begin
      # Grab the captures before any further match overwrites them.
      opening = Regexp.last_match(1)
      target = Regexp.last_match(3)
      closing = Regexp.last_match(4)
      if (target =~ /^\s*\w+:\/\//) or (target =~ /^\s*\w+:\w/)
        # already has a scheme: leave untouched
        attribute
      elsif target =~ /^\//
        # host-absolute path: keep only "scheme://host" from the feed link
        (opening + feed.link.split(/\//)[0..2].join('/') + target + closing)
      else
        pieces = feed.link.split(/\//)
        if pieces.length == 3 # http://toto with no trailing /
          (opening + feed.link + '/' + target + closing)
        elsif feed.link =~ /\/$/
          (opening + feed.link + target + closing)
        else
          # drop the last path component of the feed link
          (opening + pieces[0...-1].join('/') + '/' + target + closing)
        end
      end
    rescue
      # on any error keep the attribute exactly as it was
      attribute
    end
  end
  html
end
Convert a text in inputenc to a text in UTF-8. Must take care of wrong input locales (a text declared as not being UTF-8 may in fact already be UTF-8).
# File lib/feedparser/textconverters.rb, line 89
# Convert a text declared to be in +inputenc+ to UTF-8, guarding against
# wrong declarations.
#
# inputenc:: name of the declared input encoding (compared
#            case-insensitively with 'utf-8'); nil is accepted and treated
#            as "not declared UTF-8".
#
# Behaviour:
# * declared 'utf-8': the receiver is returned as is;
# * otherwise, if the bytes are in fact valid UTF-8, no conversion is done.
#   The receiver itself is returned when it already compares equal to its
#   UTF-8 view; if it only carries a misleading encoding tag, a copy tagged
#   UTF-8 is returned. The validity check works on the bytes, because
#   String#== is false for strings with incompatible encoding tags even
#   when their bytes are identical, which would otherwise lead to
#   double-encoding;
# * otherwise every byte is taken as one code point (which matches
#   ISO-8859-1) and re-encoded as UTF-8.
#
# Returns a String; the receiver is never modified.
def toUTF8(inputenc)
  return self if inputenc.to_s.downcase == 'utf-8'

  begin
    # it is said it is not UTF-8. Ensure it is REALLY not UTF-8
    utf8_view = dup.force_encoding(Encoding::UTF_8)
    if utf8_view.valid_encoding?
      return self if utf8_view == self
      return utf8_view
    end
    unpack('C*').pack('U*')
  rescue StandardError
    self # failsafe solution. but a dirty one :-)
  end
end
Un-escape HTML in the text. Used by String#text2html.
# File lib/feedparser/textconverters.rb, line 30
# Un-escape HTML in the text.
#
# Applies every (pattern, replacement) pair of MY_ENTITIES in turn, each
# substitution working on the result of the previous one.
#
# Returns the resulting String (the receiver itself when MY_ENTITIES is
# empty); the receiver is not modified.
def unescape_html
  MY_ENTITIES.inject(self) do |unescaped, (entity, replacement)|
    unescaped.gsub(entity, replacement)
  end
end