server-webapi-code-03-fixingBrokenHTML.php / php

  // Standalone demonstration: read a malformed HTML document from disk,
  // repair it with the Tidy extension, and print the repaired markup.
  $brokenHTML = file_get_contents('./broken.html');

  // Tidy options: indent the output, emit HTML, wrap lines at 200 columns,
  // and enable Tidy's "clean" pass.
  $config = array('indent' => TRUE,
                  'output-html' => TRUE,
                  'wrap' => 200,
                  'clean' => TRUE);

  // Parse the input as UTF-8 and echo Tidy's cleaned output.
  $tidy = tidy_parse_string($brokenHTML, $config, 'UTF8');
  echo tidy_get_output($tidy);
  /**
   * Save every item of a parsed RSS feed and count how many saves returned 2.
   *
   * @param object $xml    Parsed feed; items are read from $xml->channel->item
   *                       (presumably a SimpleXMLElement - confirm against caller).
   * @param string $source Feed identifier, passed through unchanged to saveFeed().
   *
   * @return int Number of items for which saveFeed() returned 2.
   */
  function processRSSFeed($xml, $source)
  {
    $updatedStories = 0;

    foreach ($xml->channel->item as $story) {
      // The full story body is carried in <content:encoded>, which lives in
      // the RSS content-module namespace; it is not visible among the
      // un-namespaced children of the item.
      $content = $story->children('http://purl.org/rss/1.0/modules/content/');
      $storyContent = $content->encoded;

      // NOTE(review): 2 is presumably the "existing row was replaced" result
      // reported by replaceQuery() - confirm against its definition.
      if (saveFeed($story->guid, $source, $story->title, $story->pubDate, $storyContent, $story->link) == 2) {
        $updatedStories += 1;
      }
    }

    return $updatedStories;
  }
  /**
   * Clean a single feed item and write it to the 03_feed_raw table.
   *
   * The item body is repaired with Tidy, its links and images are rewritten
   * through cleanAndDisplayHREF() / retreiveImages(), every tag except
   * <p>, <img> and <a> is stripped, and the row is stored with REPLACE INTO.
   *
   * @param string $guid    Item guid; may be empty, in which case the title is
   *                        used to build the primary key instead.
   * @param string $source  Feed identifier; part of the primary key.
   * @param string $title   Item title (HTML tags are removed before storage).
   * @param string $date    Publication date in any format strtotime() accepts;
   *                        the current time is used when it cannot be parsed.
   * @param string $content Item body HTML, possibly malformed.
   * @param string $link    Item URL.
   *
   * @return mixed Whatever replaceQuery() returns for the REPLACE statement.
   */
  function saveFeed($guid, $source, $title, $date, $content, $link)
  {
    // Primary key: hash of source + guid, falling back to source + title for
    // feeds that do not supply a guid.
    if (strlen($guid) > 0) {
      $pk = md5($source . $guid);
    } else {
      $pk = md5($source . $title);
    }

    $linkID = db_connect();

    //We still don't want any HTML tags in the title of the item
    $title = mysql_real_escape_string(strip_tags($title));

    //Clean broken HTML first, to avoid problems with other steps
    $config = array('indent' => TRUE,
                    'output-html' => TRUE,
                    'wrap' => 200,
                    'clean' => TRUE,
                    'show-body-only' => TRUE);
    $tidy = tidy_parse_string((string) $content, $config, 'UTF8');
    $content = tidy_get_output($tidy);

    //Confirm HTML links are absolute, and append the url to the link
    // A callback is used instead of the /e modifier: /e evaluated a code
    // string with feed-supplied text interpolated into it and no longer
    // exists in PHP 7+. Capture groups: 1 = href, 3 = title, 4 = link text.
    $content = preg_replace_callback(
      '/<a\s+.*?href=[\"\']?([^\"\'>]*)[\"\']?\s?(title=[\"\']?([^\"\'>]*)[\"\']?)?[^>]*>(.*?)<\/a>/i',
      function ($m) use ($source) {
        // Trailing optional groups that did not match are absent from $m.
        $m += array_fill(0, 5, '');
        return cleanAndDisplayHREF($source, $m[1], $m[3], $m[4]);
      },
      $content
    );

    //Display images as images, but load from local server
    // Capture groups: 0 = whole tag, 1 = src, 2 = width attribute,
    // 3 = width value, 4 = height attribute, 5 = height value.
    $content = preg_replace_callback(
      '/<img\s+.*?src="([^\"\' >]*)"\s?(width="([0-9]*)")?\s?(height="([0-9]*)")?[^>]*>/i',
      function ($m) use ($source) {
        // Trailing optional groups that did not match are absent from $m.
        $m += array_fill(0, 6, '');
        return retreiveImages($source, $m[0], $m[1], $m[2], $m[3], $m[4], $m[5]);
      },
      $content
    );

    // Escape every value that is interpolated into the SQL string below.
    // $source is escaped only now because the callbacks above need the raw value.
    // NOTE(review): mysql_* was removed in PHP 7; moving to prepared statements
    // requires changing db_connect()/replaceQuery(), which are defined elsewhere.
    $content = mysql_real_escape_string(strip_tags($content, "<p><img><a>"));
    $link = mysql_real_escape_string($link);
    $source = mysql_real_escape_string($source);

    // strtotime() returns false on failure (it returned -1 before PHP 5.1);
    // treat both as "unparseable" and fall back to the current time.
    $date = strtotime((string) $date);
    if ($date === false || $date == -1) {
      $date = time();
    }

    $query = "REPLACE INTO 03_feed_raw 
      (`id`, `source`, `title`, `date`, `content`, `link`)
      VALUES
      ('$pk', '$source', '$title', FROM_UNIXTIME('$date'), '$content', '$link')";

    return replaceQuery($query, $linkID);
  }

