[Original][Python] Scraping National Bureau of Statistics annual data

by hellohappy » 11 Mar 2019, 21:48

Preface:
    This article is original; the code is not. It comes from this open-source project: https://github.com/yiyuezhuo/National-Data

Goal:

    Scrape the National Bureau of Statistics' annual data with a Python program.

Method:

    Download the source code (this is my fork): https://github.com/hellohappy0/National-Data
    Open a terminal (e.g. cmd on Windows: press Win+R, type cmd, press Enter), change into the downloaded directory, then run the following command (a full example session is shown below):
        python main.py --type year --date 1978-2018 --dest data2018 --tree tree2018 --encoding gbk
    This produces the National Bureau of Statistics' national-level annual data for 1978 through 2018.
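
For example, assuming the repository was downloaded to D:\National-Data (an illustrative path, not a fixed one), the full cmd session would look like this:

Code: 全选

cd /d D:\National-Data
python main.py --type year --date 1978-2018 --dest data2018 --tree tree2018 --encoding gbk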

Core source code:

    First of all, the National Bureau of Statistics site does almost nothing to block crawlers, so most anti-scraping countermeasures are unnecessary: the program simply requests the data directly. That makes it well worth studying to deepen our understanding of Python web scraping. The core code is given below; work through it yourself to see how the author does the crawling.
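
To get a feel for how simple the crawling is, here is a minimal sketch of the two HTTP requests the whole program is built on. The endpoint and parameters are taken from the code below; 'A0E030A' is just the indicator code used in the source's own test code, and the site may have changed since this post was written:

Code: 全选

import requests
import time

url = 'http://data.stats.gov.cn/easyquery.htm'

# 1. Fetch one level of the indicator tree (m=getTree).
tree = requests.get(url, params={'id': 'zb', 'dbcode': 'hgnd',
                                 'wdcode': 'zb', 'm': 'getTree'}).json()
print(tree[0]['id'], tree[0]['name'])  # first top-level category

# 2. Query the data behind one indicator code (m=QueryData).
# The nested dfwds structure is sent as a JSON string, exactly as the
# program below does after its quote-replacement trick.
params = {'m': 'QueryData', 'dbcode': 'hgnd', 'rowcode': 'zb', 'colcode': 'sj',
          'wds': '[]',
          'dfwds': '[{"wdcode":"zb","valuecode":"A0E030A"},'
                   '{"wdcode":"sj","valuecode":"1978-2018"}]',
          'k1': int(time.time() * 1000)}
data = requests.get(url, params=params).json()
print(data['returncode'])  # 200 on success
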
Core source: main program

Code: 全选

# -*- coding: utf-8 -*-
"""
Created on Sun Apr 03 12:15:15 2016

@author: yiyuezhuo
"""

import requests
import time
import json
import os
import pickle
import pandas as pd
import numpy as np

url='http://data.stats.gov.cn/easyquery.htm'
s='''id:zb
dbcode:hgnd
wdcode:zb
m:getTree'''

dic=dict(term.split(':') for term in s.split('\n'))

res=requests.get(url,params=dic)

def check_dir(name_list):
    # create every intermediate directory along a path given as a list or a string
    if type(name_list) in [str,bytes]:
        name_list=name_list.replace('\\','/').split('/')
    now_path=name_list[0]
    for name in name_list[1:]:
        if not os.path.isdir(now_path):
            os.mkdir(now_path)
        now_path=os.path.join(now_path,name)

# A node in the NBS indicator tree; its children come from the getTree API.
class TreeNode(object):
    url='http://data.stats.gov.cn/easyquery.htm'
    params={'id':'zb','dbcode':'hgnd','wdcode':'zb','m':'getTree'}
    def __init__(self,iid='zb',name='zb',data_me=None):
        self.id=iid
        self.name=name
        self.data_me=data_me #only leaf nodes need this field
        self.data=None
        self.children=[]
        self.leaf=None
    def get(self,force=False,verbose=True):
        if verbose:
            print('getting',self.id,self.name)
        if force or self.data is None:
            params=TreeNode.params.copy()
            params['id']=self.id
            res=requests.get(TreeNode.url,params=params)
            self.data=res.json()
            for data in self.data:
                self.children.append(TreeNode(iid=data['id'],name=data['name'],
                                              data_me=data))
            self.leaf=len(self.children)==0
    def get_recur(self,force=False,verbose=True):
        if force or self.data is None:
            self.get(force=force,verbose=verbose)
            for child in self.children:
                child.get_recur(force=force,verbose=verbose)
    def to_dict(self):
        children=[child.to_dict() for child in self.children]
        rd=self.data_me.copy() if self.data_me is not None else {'id':self.id,'name':self.name} #root node has no data_me
        rd['children']=children
        return rd
    def display(self,level=0):
        print(' '*level+self.name+' '+self.id)
        for child in self.children:
            child.display(level+1)
    def get_all_pair(self):
        if self.leaf:
            return [(self.id,self.name)]
        else:
            rl=[]
            for child in self.children:
                rl.extend(child.get_all_pair())
            return rl
            

# Downloads the raw JSON answer for every leaf indicator and caches it to disk.
class Downloader(object):
    def __init__(self,tree,raw_root='raw',date='1978-2014'):
        self.tree=tree
        self.map_name=dict(tree.get_all_pair())
        self.map_json={}
        self.raw_root=raw_root
        self.date=date
    def get_params(self,valuecode):
        params={'m':'QueryData','dbcode':'hgnd',
                'rowcode':'zb','colcode':'sj',
                'wds':[],
                'dfwds':[{'wdcode':'zb','valuecode':None},
                         {'wdcode':'sj','valuecode':self.date}],
                'k1':None}
        # requests can't serialize nested lists/dicts in params, so convert
        # them to strings and replace ' with " so the server receives valid JSON
        params['dfwds'][0]['valuecode']=str(valuecode)
        params['k1']=int(time.time()*1000)
        rp={key:str(value).replace("'",'"') for key,value in params.items()}
        return rp
    def download_once(self,valuecode,to_json=False):
        url='http://data.stats.gov.cn/easyquery.htm'
        params=self.get_params(valuecode)
        res=requests.get(url,params=params)
        if to_json:
            return res.json()
        else:
            return res.content
    def valuecode_path(self,valuecode):
        return os.path.join(self.raw_root,valuecode)
    def cache(self,valuecode,content):
        # write the raw response bytes to disk under the raw directory
        with open(self.valuecode_path(valuecode),'wb') as f:
            f.write(content)
    def is_exists(self,valuecode,to_json=False):
        if to_json:
            return valuecode in self.map_json
        else:
            path=os.path.join(self.raw_root,valuecode)
            return os.path.isfile(path)
    def download(self,verbose=True,to_json=False):
        length=len(self.map_name)
        for index,valuecode in enumerate(self.map_name.keys()):
            if verbose:
                print('get data',valuecode,self.map_name[valuecode],'clear',float(index)/length)
            if not self.is_exists(valuecode,to_json=to_json):
                res_obj=self.download_once(valuecode,to_json=to_json)
                if to_json:
                    self.map_json[valuecode]=res_obj
                else:
                    self.cache(valuecode,res_obj)
                    
# Turns the cached JSON responses into pandas DataFrames and csv files.
class Document(object):
    def __init__(self,raw_root='raw'):
        self.raw_root=raw_root
    def get(self,name):
        path=os.path.join(self.raw_root,name)
        with open(path,'r', encoding = 'utf8') as f:
            content=f.read()
        return content
    def get_json(self,name):
        return json.loads(self.get(name))
    def json_to_dataframe(self,dic,origin_code=True):
        assert dic['returncode']==200
        returndata=dic['returndata']
        datanodes,wdnodes=returndata['datanodes'],returndata['wdnodes']
        if not origin_code:#parse wdnodes for transform that
            wd={w['wdcode']:{ww['code']:ww['cname'] for ww in w['nodes']} for w in wdnodes}
            zb_wd,sj_wd=wd['zb'],wd['sj']
        rd={}
        for node in datanodes:
            sd={w['wdcode']:w['valuecode'] for w in node['wds']}
            zb,sj=sd['zb'],sd['sj']
            if not origin_code:
                zb,sj=zb_wd[zb],sj_wd[sj]
            rd[(sj,zb)]=node['data']['data'] if node['data']['hasdata'] else np.nan
        df=pd.Series(rd).unstack()
        return df
    def get_dataframe(self,name,origin_code=False):
        return self.json_to_dataframe(self.get_json(name),origin_code=origin_code)
    def to_csv(self,name,path,encoding='utf8'):
        df=self.get_dataframe(name)
        df.to_csv(path,encoding=encoding)
    def iter_tree(self,tree,path=('zb',),origin_dir=False):
        yield path,tree
        for node in tree.children:
            newpath=path+((node.id,) if origin_dir else (node.name,))
            for r in self.iter_tree(node,path=newpath):
                yield r
    def to_csv_all(self,tree,root='data',encoding='utf8'):
        for path,node in self.iter_tree(tree):
            if node.leaf:
                path_t=(root,)+path
                check_dir(path_t)
                self.to_csv(node.id,os.path.join(*path_t)+'.csv',encoding=encoding)
                
'''
def json_to_dataframe(dic,origin_code=True):
    assert dic['returncode']==200
    returndata=dic['returndata']
    datanodes,wdnodes=returndata['datanodes'],returndata['wdnodes']
    if not origin_code:#parse wdnodes for transform that
        wd={w['wdcode']:{ww['code']:ww['cname'] for ww in w['nodes']} for w in wdnodes}
        zb_wd,sj_wd=wd['zb'],wd['sj']
    rd={}
    for node in datanodes:
        sd={w['wdcode']:w['valuecode'] for w in node['wds']}
        zb,sj=sd['zb'],sd['sj']
        if not origin_code:
            zb,sj=zb_wd[zb],sj_wd[sj]
        rd[(sj,zb)]=node['data']['data']
    df=pd.Series(rd).unstack()
    return df
                
def cache(downloader,root='raw'):
    for key,value in downloader.map_json.items():
        with open(os.path.join(root,key),'w') as f:
            json.dump(value,f)
            
def semicode(params):
    return {key:str(value).replace("'",'"') for key,value in params.items()}
    
def test_direct():
    valuecode='A0E030A'
    params={'m':'QueryData','dbcode':'hgnd',
            'rowcode':'zb','colcode':'sj',
            'wds':[],
            'dfwds':[{'wdcode':'zb','valuecode':None},
                     {'wdcode':'sj','valuecode':'1978-2014'}],
            'k1':None}
    # requests can't serialize nested lists/dicts in params, so convert
    # them to strings and replace ' with " so the server receives valid JSON
    params['dfwds'][0]['valuecode']=valuecode
    params['k1']=int(time.time()*1000)
    rp={key:str(value).replace("'",'"') for key,value in params.items()}
    url='http://data.stats.gov.cn/easyquery.htm'
    res=requests.get(url,params=rp)
    return res.json()

def test(tree):
    downloader=Downloader(tree)
    return downloader.download_once(list(downloader.map_name.keys())[0])
'''

def run(args):
    
    print('init tree')
    if os.path.isfile(args.tree):
        print('init tree by cache')
        with open(args.tree,'rb') as f:
            tree=pickle.load(f)
    else:
        print('init tree by web')
        tree=TreeNode()
        tree.get_recur()
        with open(args.tree,'wb') as f:
            print('cache tree information...')
            pickle.dump(tree,f)
            
    if not os.path.isdir(args.raw):
        os.mkdir(args.raw)
    if not os.path.isdir(args.dest):
        os.mkdir(args.dest)
    
    print('start download file')
    downloader=Downloader(tree,raw_root=args.raw,date=args.date)
    downloader.download()
    print('start transform JSON raw file to csv file')
    doc=Document(raw_root=args.raw)
    doc.to_csv_all(tree,root=args.dest,encoding=args.encoding)
    print('clear')
    
def CLI():
    import argparse
    parser = argparse.ArgumentParser(usage=u'python main.py --encoding utf-8 --date 1978-2015 --dest new_data --raw new_tree',
                                     description=u"National data scraper")
    parser.add_argument('--type',default='year',help=u'which type of data to fetch; currently unused')
    parser.add_argument('--encoding',default='utf-8',help=u"encoding of the output csv files; the default UTF-8 may be unfriendly to Excel")
    parser.add_argument('--date',default='1978-2015',help=u'requested data range, e.g. --date 1978-2015')
    parser.add_argument('--dest',default='data',help=u"output directory")
    parser.add_argument('--raw',default='raw',help=u'directory for the intermediate json files')
    parser.add_argument('--tree',default='tree',help=u'cache path for the tree file, default: tree')
    
    args=parser.parse_args()
    run(args)
    

    
if __name__=="__main__":
    import sys
    if len(sys.argv)<=1:
        print('DEBUG MODE')
        print('IF YOU WANT TO USE THE CLI, PASS AT LEAST ONE ARGUMENT TO ACTIVATE IT')
        
        # Provide a stand-in args object to support debugging
        class Args(object):
            pass
        args = Args()
        args.type = 'year'
        args.encoding = 'utf-8'
        #args.date = '1978-2014'
        args.date = '1978-2015'
        args.dest = 'data_test'
        args.raw = 'raw_test'
        args.tree = 'tree_test'
        #run(args)
    else:
        CLI()

'''
tree=TreeNode()
tree.get_recur()
tree.display()
'''

'''
with open('tree','rb') as f:
    tree=pickle.load(f)
downloader=Downloader(tree,date='1978-2014')
downloader.download()
doc=Document()
'''
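
Each leaf indicator ends up as one csv file under the --dest directory, mirroring the indicator tree: rows are periods (the sj dimension) and columns are indicators (the zb dimension). Here is a minimal sketch of reading one back with pandas; the file path is hypothetical, since the real directory names follow the Chinese category names in the tree:

Code: 全选

import pandas as pd

# Hypothetical path; substitute a file the crawler actually produced.
df = pd.read_csv('data2018/zb/some_category/some_indicator.csv',
                 index_col=0, encoding='gbk')  # match the --encoding you used
print(df.head())
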
Core source: utf-8 to gbk converter

Code: 全选

# -*- coding: utf-8 -*-
"""
Created on Tue May 22 21:48:50 2018

@author: yiyuezhuo
"""

import os

import argparse

parser = argparse.ArgumentParser(usage=u'python main.py data2016 data2016-gbk',
                                 description=u"convert utf8 to gbk tool")
parser.add_argument('source')
parser.add_argument('dest')

args = parser.parse_args()
source = args.source
dest = args.dest

for dir_path,sub_dir,fnames in os.walk(args.source):
    for fname in fnames:
        path = os.path.join(dir_path,fname)
        path_list = path.split(os.path.sep)
        dest_root = os.path.join(dest, os.path.sep.join(path_list[1:-1]))
        dest_path = os.path.join(dest, os.path.sep.join(path_list[1:]))
        os.makedirs(dest_root,exist_ok=True)
        with open(path,encoding='utf8') as f:
            d = f.read()
        with open(dest_path,'w',encoding='gbk') as f:
            f.write(d)
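
Assuming the converter above is saved as to_gbk.py (my name for it; the usage string in the code says main.py, but that clashes with the crawler's main.py), converting a finished data directory looks like this:

Code: 全选

python to_gbk.py data2018 data2018-gbk
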
Data download:

    The data itself can be downloaded from this thread:
        viewtopic.php?f=2&t=84
