diff options
author | Silvio Rhatto <rhatto@riseup.net> | 2013-08-27 12:35:54 -0300 |
---|---|---|
committer | Silvio Rhatto <rhatto@riseup.net> | 2013-08-27 12:35:54 -0300 |
commit | f58e4867180e1298af9684cbf787d7ff3764a31a (patch) | |
tree | 333b0a2f933f53b5f15b9ec26579e1a6b9d79838 | |
parent | bd8e556a981edc1f2e927a04f0501e29ab40f174 (diff) | |
download | httruta-f58e4867180e1298af9684cbf787d7ff3764a31a.tar.gz httruta-f58e4867180e1298af9684cbf787d7ff3764a31a.tar.bz2 |
Parsing html entities in the URLs so we get the right hash
-rwxr-xr-x | httracker | 3 | ||||
-rwxr-xr-x | lib/httracker/html.sed | 488 |
2 files changed, 490 insertions, 1 deletions
@@ -8,7 +8,8 @@ source `dirname $0`/lib/httracker/functions || exit 1 # Get URLs from feed # Thanks http://stackoverflow.com/questions/443991/how-to-parse-rss-feeds-xml-in-a-shell-script -curl -s "$FEED" | grep -o '<link>[^<]*' | grep -o "[^>]*$" > $URLS +curl -s "$FEED" | grep -o '<link>[^<]*' | grep -o "[^>]*$" \ + | `dirname $0`/lib/httracker/html.sed > $URLS if [ "$?" != "0" ]; then echo "Error downloading feed $FEED, aborting." diff --git a/lib/httracker/html.sed b/lib/httracker/html.sed new file mode 100755 index 0000000..a135ca6 --- /dev/null +++ b/lib/httracker/html.sed @@ -0,0 +1,488 @@ +#!/bin/sed -f +# Thanks https://gist.github.com/mlk/5222903/raw/6ef14f5fc351ed057d97da2fe1d83dcefc64c817/html.sed +s/ / /g +s/ / /g +s/¡/¡/g +s/¡/¡/g +s/¢/¢/g +s/¢/¢/g +s/£/£/g +s/£/£/g +s/¤/¤/g +s/¤/¤/g +s/¥/¥/g +s/¥/¥/g +s/¦/¦/g +s/¦/¦/g +s/§/§/g +s/§/§/g +s/¨/¨/g +s/¨/¨/g +s/©/©/g +s/©/©/g +s/ª/ª/g +s/ª/ª/g +s/«/«/g +s/«/«/g +s/¬/¬/g +s/¬/¬/g +s/­//g +s/­//g +s/®/®/g +s/®/®/g +s/¯/¯/g +s/¯/¯/g +s/°/°/g +s/°/°/g +s/±/±/g +s/±/±/g +s/²/²/g +s/²/²/g +s/³/³/g +s/³/³/g +s/´/´/g +s/´/´/g +s/µ/µ/g +s/µ/µ/g +s/¶/¶/g +s/¶/¶/g +s/·/·/g +s/·/·/g +s/¸/¸/g +s/¸/¸/g +s/¹/¹/g +s/¹/¹/g +s/º/º/g +s/º/º/g +s/»/»/g +s/»/»/g +s/¼/¼/g +s/¼/¼/g +s/½/½/g +s/½/½/g +s/¾/¾/g +s/¾/¾/g +s/¿/¿/g +s/¿/¿/g +s/×/×/g +s/×/×/g +s/÷/÷/g +s/÷/÷/g +s/À/À/g +s/À/À/g +s/Á/Á/g +s/Á/Á/g +s/Â/Â/g +s/Â/Â/g +s/Ã/Ã/g +s/Ã/Ã/g +s/Ä/Ä/g +s/Ä/Ä/g +s/Å/Å/g +s/Å/Å/g +s/Æ/Æ/g +s/Æ/Æ/g +s/Ç/Ç/g +s/Ç/Ç/g +s/È/È/g +s/È/È/g +s/É/É/g +s/É/É/g +s/Ê/Ê/g +s/Ê/Ê/g +s/Ë/Ë/g +s/Ë/Ë/g +s/Ì/Ì/g +s/Ì/Ì/g +s/Í/Í/g +s/Í/Í/g +s/Î/Î/g +s/Î/Î/g +s/Ï/Ï/g +s/Ï/Ï/g +s/Ð/Ð/g +s/Ð/Ð/g +s/Ñ/Ñ/g +s/Ñ/Ñ/g +s/Ò/Ò/g +s/Ò/Ò/g +s/Ó/Ó/g +s/Ó/Ó/g +s/Ô/Ô/g +s/Ô/Ô/g +s/Õ/Õ/g +s/Õ/Õ/g +s/Ö/Ö/g +s/Ö/Ö/g +s/Ø/Ø/g +s/Ø/Ø/g +s/Ù/Ù/g +s/Ù/Ù/g +s/Ú/Ú/g +s/Ú/Ú/g +s/Û/Û/g +s/Û/Û/g +s/Ü/Ü/g +s/Ü/Ü/g +s/Ý/Ý/g +s/Ý/Ý/g +s/Þ/Þ/g +s/Þ/Þ/g +s/ß/ß/g +s/ß/ß/g +s/à/à/g +s/à/à/g +s/á/á/g +s/á/á/g +s/â/â/g +s/â/â/g +s/ã/ã/g +s/ã/ã/g +s/ä/ä/g +s/ä/ä/g +s/å/å/g +s/å/å/g +s/æ/æ/g +s/æ/æ/g +s/ç/ç/g +s/ç/ç/g +s/è/è/g +s/è/è/g +s/é/é/g +s/é/é/g +s/ê/ê/g +s/ê/ê/g +s/ë/ë/g +s/ë/ë/g +s/ì/ì/g +s/ì/ì/g +s/í/í/g +s/í/í/g +s/î/î/g +s/î/î/g +s/ï/ï/g +s/ï/ï/g +s/ð/ð/g +s/ð/ð/g +s/ñ/ñ/g +s/ñ/ñ/g +s/ò/ò/g +s/ò/ò/g +s/ó/ó/g +s/ó/ó/g +s/ô/ô/g +s/ô/ô/g +s/õ/õ/g +s/õ/õ/g +s/ö/ö/g +s/ö/ö/g +s/ø/ø/g +s/ø/ø/g +s/ù/ù/g +s/ù/ù/g +s/ú/ú/g +s/ú/ú/g +s/û/û/g +s/û/û/g +s/ü/ü/g +s/ü/ü/g +s/ý/ý/g +s/ý/ý/g +s/þ/þ/g +s/þ/þ/g +s/ÿ/ÿ/g +s/ÿ/ÿ/g +s/∀/∀/g +s/∀/∀/g +s/∂/∂/g +s/∂/∂/g +s/∃/∃/g +s/∃/∃/g +s/∅/∅/g +s/∅/∅/g +s/∇/∇/g +s/∇/∇/g +s/∈/∈/g +s/∈/∈/g +s/∉/∉/g +s/∉/∉/g +s/∋/∋/g +s/∋/∋/g +s/∏/∏/g +s/∏/∏/g +s/∑/∑/g +s/∑/∑/g +s/−/−/g +s/−/−/g +s/∗/∗/g +s/∗/∗/g +s/√/√/g +s/√/√/g +s/∝/∝/g +s/∝/∝/g +s/∞/∞/g +s/∞/∞/g +s/∠/∠/g +s/∠/∠/g +s/∧/∧/g +s/∧/∧/g +s/∨/∨/g +s/∨/∨/g +s/∩/∩/g +s/∩/∩/g +s/∪/∪/g +s/∪/∪/g +s/∫/∫/g +s/∫/∫/g +s/∴/∴/g +s/∴/∴/g +s/∼/∼/g +s/∼/∼/g +s/≅/≅/g +s/≅/≅/g +s/≈/≈/g +s/≈/≈/g +s/≠/≠/g +s/≠/≠/g +s/≡/≡/g +s/≡/≡/g +s/≤/≤/g +s/≤/≤/g +s/≥/≥/g +s/≥/≥/g +s/⊂/⊂/g +s/⊂/⊂/g +s/⊃/⊃/g +s/⊃/⊃/g +s/⊄/⊄/g +s/⊄/⊄/g +s/⊆/⊆/g +s/⊆/⊆/g +s/⊇/⊇/g +s/⊇/⊇/g +s/⊕/⊕/g +s/⊕/⊕/g +s/⊗/⊗/g +s/⊗/⊗/g +s/⊥/⊥/g +s/⊥/⊥/g +s/⋅/⋅/g +s/⋅/⋅/g +s/Α/Α/g +s/Α/Α/g +s/Β/Β/g +s/Β/Β/g +s/Γ/Γ/g +s/Γ/Γ/g +s/Δ/Δ/g +s/Δ/Δ/g +s/Ε/Ε/g +s/Ε/Ε/g +s/Ζ/Ζ/g +s/Ζ/Ζ/g +s/Η/Η/g +s/Η/Η/g +s/Θ/Θ/g +s/Θ/Θ/g +s/Ι/Ι/g +s/Ι/Ι/g +s/Κ/Κ/g +s/Κ/Κ/g +s/Λ/Λ/g +s/Λ/Λ/g +s/Μ/Μ/g +s/Μ/Μ/g +s/Ν/Ν/g +s/Ν/Ν/g +s/Ξ/Ξ/g +s/Ξ/Ξ/g +s/Ο/Ο/g +s/Ο/Ο/g +s/Π/Π/g +s/Π/Π/g +s/Ρ/Ρ/g +s/Ρ/Ρ/g +s/Σ/Σ/g +s/Σ/Σ/g +s/Τ/Τ/g +s/Τ/Τ/g +s/Υ/Υ/g +s/Υ/Υ/g +s/Φ/Φ/g +s/Φ/Φ/g +s/Χ/Χ/g +s/Χ/Χ/g +s/Ψ/Ψ/g +s/Ψ/Ψ/g +s/Ω/Ω/g +s/Ω/Ω/g +s/α/α/g +s/α/α/g +s/β/β/g +s/β/β/g +s/γ/γ/g +s/γ/γ/g +s/δ/δ/g +s/δ/δ/g +s/ε/ε/g +s/ε/ε/g +s/ζ/ζ/g +s/ζ/ζ/g +s/η/η/g +s/η/η/g +s/θ/θ/g +s/θ/θ/g +s/ι/ι/g +s/ι/ι/g +s/κ/κ/g +s/κ/κ/g +s/λ/λ/g +s/λ/λ/g +s/μ/μ/g +s/μ/μ/g +s/ν/ν/g +s/ν/ν/g +s/ξ/ξ/g +s/ξ/ξ/g +s/ο/ο/g +s/ο/ο/g +s/π/π/g +s/π/π/g +s/ρ/ρ/g +s/ρ/ρ/g +s/ς/ς/g +s/ς/ς/g +s/σ/σ/g +s/σ/σ/g +s/τ/τ/g +s/τ/τ/g +s/υ/υ/g +s/υ/υ/g +s/φ/φ/g +s/φ/φ/g +s/χ/χ/g +s/χ/χ/g +s/ψ/ψ/g +s/ψ/ψ/g +s/ω/ω/g +s/ω/ω/g +s/ϑ/ϑ/g +s/ϑ/ϑ/g +s/ϒ/ϒ/g +s/ϒ/ϒ/g +s/ϖ/ϖ/g +s/ϖ/ϖ/g +s/Œ/Œ/g +s/Œ/Œ/g +s/œ/œ/g +s/œ/œ/g +s/Š/Š/g +s/Š/Š/g +s/š/š/g +s/š/š/g +s/Ÿ/Ÿ/g +s/Ÿ/Ÿ/g +s/ƒ/ƒ/g +s/ƒ/ƒ/g +s/ˆ/ˆ/g +s/ˆ/ˆ/g +s/˜/˜/g +s/˜/˜/g +s/ / /g +s/ / /g +s/ / /g +s/ / /g +s/ / /g +s/ / /g +s/‌//g +s/‌//g +s/‍//g +s/‍//g +s/‎//g +s/‎//g +s/‏//g +s/‏//g +s/–/–/g +s/–/–/g +s/—/—/g +s/—/—/g +s/‘/‘/g +s/‘/‘/g +s/’/’/g +s/’/’/g +s/‚/‚/g +s/‚/‚/g +s/“/“/g +s/“/“/g +s/”/”/g +s/”/”/g +s/„/„/g +s/„/„/g +s/†/†/g +s/†/†/g +s/‡/‡/g +s/‡/‡/g +s/•/•/g +s/•/•/g +s/…/…/g +s/…/…/g +s/‰/‰/g +s/‰/‰/g +s/′/′/g +s/′/′/g +s/″/″/g +s/″/″/g +s/‹/‹/g +s/‹/‹/g +s/›/›/g +s/›/›/g +s/‾/‾/g +s/‾/‾/g +s/€/€/g +s/€/€/g +s/™/™/g +s/™/™/g +s/™/™/g +s/←/←/g +s/←/←/g +s/↑/↑/g +s/↑/↑/g +s/→/→/g +s/→/→/g +s/↓/↓/g +s/↓/↓/g +s/↔/↔/g +s/↔/↔/g +s/↵/↵/g +s/↵/↵/g +s/⌈/⌈/g +s/⌈/⌈/g +s/⌉/⌉/g +s/⌉/⌉/g +s/⌊/⌊/g +s/⌊/⌊/g +s/⌋/⌋/g +s/⌋/⌋/g +s/◊/◊/g +s/◊/◊/g +s/♠/♠/g +s/♠/♠/g +s/♣/♣/g +s/♣/♣/g +s/♥/♥/g +s/♥/♥/g +s/♦/♦/g +s/♦/♦/g +s/"/"/g +s/"/"/g +s/'/'/g +s/'/'/g +s/</</g +s/</</g +s/>/>/g +s/>/>/g +s/&/&/g +s/&/&/g +# http://www.w3schools.com/tags/ref_entities.asp +# ^([^ \t]+)[ \t]+(&[^;]*;)[ \t]+(&[^;]*;).*$ +# s/\2/\1/g\ns/\3/\1/g |