导入老博客(blogger)内容(附代码)

Pubdate:2010-11-11 15:53:42    Categories: python    1680 Views
Tags: python


一年多了,终于导入了blogger上老博客的内容。

出奇地顺利。从blogger中导出原博客的内容,是xml格式的文件。运行下面的代码。我的博客是用pylogs建的,后台数据库是postgresql。不过只导入了题目、正文和发布日,其它的先不管了。附代码,有类似需求的可参考。

-----------据说现在流行分割------------

blogger2pylogs.py

#!/usr/bin/env python
# -*- coding: UTF-8 -*-
"""模块名
@version: $Id$
@author: U{Liu Qing}
"""
__author__ =  '刘清'
__version__=  '1.0'
__nonsense__ = ''

from blogger import *
import psycopg2, codecs

# Path of the XML file exported from Blogger.
source = 'blog-11-23-2009.xml'

# Blogger() reads the whole file in its constructor, so the handle can be
# closed as soon as the object has been built.
with open(source) as export_file:
    blogger = Blogger(export_file)
target = list(blogger.posts())

conn = psycopg2.connect("dbname = 'pylogs' host = 'localhost' user = 'username' password = 'password'")
try:
    cur = conn.cursor()
    try:
        for post in target:
            print(post['title'])
            # The rows are sent to the database as UTF-8 byte strings.
            post['title'] = post['title'].encode("utf-8")
            post['content'] = post['content'].encode("utf-8")
            # NOTE(review): the fixed 35-character cut presumably strips the
            # 'tag:blogger.com,1999:blog-<blog id>.post-' prefix so that only
            # the post number is used as post_name; the correct offset
            # depends on the length of the blog id -- confirm.
            post['id'] = post['id'][35:]

            # Only title, content, post_name and pubdate come from the
            # export; every other column gets a fixed value.
            cur.execute("""INSERT INTO blog_post
        (title,content,post_name,post_type,post_status,pubdate,hits,menu_order,comment_status,comment_count) VALUES
        (%(title)s,%(content)s,%(id)s,'post','publish',%(published)s,500,0,'open',0);""", post)
        # Commit once, after every post has been inserted successfully.
        conn.commit()
    finally:
        cur.close()
finally:
    # Runs on failure too, so the connection is never left open.
    conn.close()


blogger.py(这个是在网上找的,出处不可考)

#!/usr/bin/env python
# encoding: utf-8

"""A Blogger export data to Python data tool"""

__version__ = "0.1"

import re
import datetime
from xml.dom import minidom
from xml.parsers.expat import ExpatError, ErrorString

class XmlError(Exception):
    """Raised by Blogger when the supplied export cannot be parsed as XML."""


class Blogger(object):
    """A class to parse Blogger export XML"""

    # ``term`` attribute value of the <category> element that marks an
    # <entry> as a blog post.
    _POST_KIND = 'http://schemas.google.com/blogger/2008/kind#post'

    def __init__(self, xml):
        """Parse a Blogger export and keep the DOM in ``self.xml``.

        Takes either a string or a file-like object (anything with a
        ``read`` method, which is read completely).

        Raises:
            XmlError: if the input is not well-formed XML or is of a type
                the parser cannot handle.

        Usage:
        >>> blogger = Blogger(open('path/to.xml'))
        >>> blogger = Blogger('<xml></xml>')
        """
        if hasattr(xml, 'read'):
            # file-like object: parse its whole content
            xml = xml.read()
        try:
            self.xml = minidom.parseString(xml)
        except ExpatError as error:
            # Report the parser's human-readable message for the error code.
            raise XmlError(ErrorString(error.code))
        except TypeError as error:
            # probably wasn't XML
            raise XmlError(error)

    def posts(self, with_comments=None):
        """Yield one dict per blog post found in the export.

        This is a generator; posts are produced in the order their
        ``<entry>`` elements appear in the document. An entry counts as a
        post when any of its ``<category>`` elements carries the Blogger
        "post" kind term; entries without such a category are skipped.

        ``with_comments`` is accepted but currently not used.

        Usage:
        >>> blogger = Blogger(open('path/to.xml'))
        >>> [i for i in blogger.posts()]
        [{
            'id':'tag:blogger.com,1999:blog-7219822.post-111599600799349916',
            'published': datetime.datetime(2009, 1, 1, 13, 45, 34),
            'updated': datetime.datetime(2009, 1, 1, 13, 45, 34),
            'title': "My blog post",
            'content':'<p>Something</p>'
        }]

        ``published`` / ``updated`` are ``None`` when the date text cannot
        be parsed; missing text fields are empty strings.
        """
        for entry in self.xml.getElementsByTagName('entry'):
            if not self._is_post(entry):
                continue
            try:
                data = {
                    'id': get_first_node_data(entry, 'id'),
                    'published': iso8601_to_datetime(get_first_node_data(entry, 'published')),
                    'updated': iso8601_to_datetime(get_first_node_data(entry, 'updated')),
                    'title': get_first_node_data(entry, 'title'),
                    'content': get_first_node_data(entry, 'content'),
                }
            except ExpatError:
                # Best effort: skip an entry that cannot be read.
                continue
            yield data

    @classmethod
    def _is_post(cls, entry):
        """Tell whether ``entry`` has a <category> with the post kind term."""
        # Check every category, not just the first one: an entry may carry
        # several categories, or none at all.
        return any(
            category.getAttribute('term') == cls._POST_KIND
            for category in entry.getElementsByTagName('category')
        )


def get_first_node_data(xml_nodes, node_name):
    """Return the text held by the first ``node_name`` element.

    Args:
        xml_nodes: a single DOM node (document or element) providing
            ``getElementsByTagName``; its descendants are searched.
        node_name: tag name of the element to look for.

    Returns:
        The ``data`` of the first child of the first matching element, or
        an empty unicode string when there is no matching element, the
        element is empty, or its first child has no ``data`` attribute.

    Any other error propagates unchanged, keeping its original type.

    Usage:
    >>> doc = minidom.parseString('<a><thing>Text</thing></a>')
    >>> get_first_node_data(doc, 'thing')
    u'Text'
    """
    nodes = xml_nodes.getElementsByTagName(node_name)
    try:
        return nodes[0].firstChild.data
    except (ExpatError, AttributeError, IndexError):
        # IndexError: no such element; AttributeError: the element is empty
        # (firstChild is None) or its first child has no ``data``.
        return u""

def iso8601_to_datetime(iso_date):
    """Parse an ISO-8601 style timestamp into a naive ``datetime.datetime``.

    Only the year, month, day, hour, minute and second fields are used.
    Whatever follows the seconds (an optional comma, a fractional part, a
    timezone designator or offset) is matched but discarded.

    Usage:
    >>> iso8601_to_datetime('2009-04-24T15:48:26,000000Z')
    datetime.datetime(2009, 4, 24, 15, 48, 26)

    Returns ``None`` when the string does not have the expected layout.
    """
    parsed = re.match(
        r'^(\d{4})-(\d{2})-(\d{2})T(\d{2}):(\d{2}):(\d{2}),?(.*)$', iso_date)
    if parsed is None:
        return None
    # The seventh group (everything after the seconds) is deliberately unused.
    year, month, day, hour, minute, second = [
        int(field) for field in parsed.groups()[:6]]
    return datetime.datetime(year, month, day, hour, minute, second)

Comments(0)

相关文章

Leave Your Comment
请输入左边图片中的字符
无觅相关文章插件