Crossin的编程教室»论坛 › 其他 › 闲聊灌水 › 新人報到

1 2 3 4 5 / 5 页

返回列表

楼主: manhong2112

新人報到

[复制链接]

crossin先生

174 主题	45 好友	10万积分

管理员

Rank: 9 Rank: 9 Rank: 9

发消息

41^#

发表于 2016-8-16 14:50:14 |只看该作者

Skeel 发表于 2016-8-15 22:13
做的很好，不過請問代碼編輯器是怎麼弄出來。

回帖里的代码编辑器？
有个 <> 图标的按钮

#==== Crossin的编程教室 ====#
微信ID：crossincode
网站：http://crossincode.com

使用道具举报

manhong2112

1 主题	0 好友	207 积分

中级会员

Rank: 3 Rank: 3

发消息

42^#

发表于 2016-8-22 15:28:39 |只看该作者

本帖最后由 manhong2112 于 2016-8-22 16:00 编辑

寫了一個爬蟲, 一開始用lxml總是報錯(貌似是網站的HTML有問題)...就轉用BeautifulSoup了...
雖然能用, 但總感覺哪裡不太對勁..
EDIT: 相對路徑沒被簡化, 組成了一個迴圈....
EDIT: 找到了urljoin這函數, 希望不再出bug

# from lxml import etree
from bs4 import BeautifulSoup
import urllib as request
from urlparse import urljoin
import re
import os
def getHtml(url):
    """Fetch *url* and return ``(url, text)`` with the body decoded to text.

    The body is read exactly once and then decoded as UTF-8, falling back to
    Big5 when the bytes are not valid UTF-8.  (Reading the response a second
    time for the fallback would yield an empty string, because the stream has
    already been consumed.)

    Raises whatever ``request.urlopen``/``read`` raise on network errors, and
    ``UnicodeDecodeError`` if the body is neither UTF-8 nor Big5.
    """
    res = request.urlopen(url)
    try:
        raw = res.read()
    finally:
        # Release the connection even when reading fails.
        res.close()
    try:
        return url, raw.decode("utf-8")
    except UnicodeDecodeError:
        return url, raw.decode("big5")
def extractLink(url, html):
    """Return the set of absolute link targets found in *html*.

    Every ``<a href=...>`` is resolved against *url* with ``urljoin`` and has
    its ``#fragment`` stripped, so links that differ only by fragment collapse
    into one entry.
    """
    soup = BeautifulSoup(html, "html.parser")
    # href=True restricts the search to anchors that actually carry an href;
    # indexing a["href"] on a bare <a name="..."> would raise KeyError and
    # lose every link on the page.
    return set(
        re.sub("#.*$", "", urljoin(url, a["href"]))
        for a in soup.findAll("a", href=True)
    )
# Crawl every page reachable from ``startAt`` that lives under ``target`` and
# mirror it below ``output``, using the URL path as the file path.
target = "http://example.com/"  # site prefix; only URLs under it are fetched
startAt = "http://example.com/index.html"  # page the crawl starts from
output = "output"  # directory the mirrored pages are written to
downloadedLink = set()  # every URL already taken off the queue
toBeDownload = set([startAt])  # URLs still waiting to be fetched
p = re.compile("https?://.*?/(.*)")  # group(1) = path part after the host
while toBeDownload:
    url = toBeDownload.pop()
    downloadedLink.add(url)
    # Anything outside the target site (other hosts, mailto:, javascript:,
    # ...) is skipped without being fetched.
    if not url.startswith(target):
        continue
    try:
        # Parenthesised single-argument print behaves the same on Python 2.
        print("Downloading " + url)
        _, content = getHtml(url)
        m = p.match(url).group(1)
        path, name = os.path.split(m)
        if not name:
            # Directory-style URLs ("http://host/dir/") have no file name;
            # store them as index.html instead of trying to open a directory.
            name = "index.html"
        directory = os.path.join(output, path)
        if not os.path.isdir(directory):
            os.makedirs(directory)
        # Binary mode: the data is already UTF-8 encoded bytes.
        with open(os.path.join(directory, name), "wb") as f:
            f.write(content.encode('utf8'))
        toBeDownload |= extractLink(url, content) - downloadedLink
    except Exception:
        # Best effort: report the page and carry on with the rest of the queue.
        print("Failed to Download " + url)

复制代码

使用道具举报

manhong2112

1 主题	0 好友	207 积分

中级会员

Rank: 3 Rank: 3

发消息

43^#

发表于 2016-10-10 21:00:38 |只看该作者

本帖最后由 manhong2112 于 2016-10-21 17:32 编辑

原本想弄個筆記本, 然後架個站自用...然後寫着寫着寫成了vcs(算嗎?算吧)......
話說混亂到自己都看不太下去了XDDD
EDIT: 換成if-else...感覺不想再看到python的lambda了....
EDIT: 大改了一下, 和把index的rollback功能刪了

import hashlib
import os
import json
import datetime
import time
# Root directory for stored objects; getPath() builds every object path under it.
dataLoc = "data"
class Enitiy(object):
    """Plain record describing one stored entry.

    Fields (all public, set verbatim from the constructor arguments):
    ``name``, ``dtype`` (type label), ``hashcode`` (key of the stored
    content) and ``history`` (supplied by the caller; other code in this file
    passes a dict of earlier revisions, or ``None`` for an ``Index``).
    """

    def __init__(self, name, dtype, hashcode, history):
        self.name, self.dtype = name, dtype
        self.hashcode, self.history = hashcode, history
class Index(Enitiy):
    """Directory-like entity mapping child names to child entities.

    ``jsons`` is the decoded form of what ``dumpJsons`` produces: a dict
    whose ``"index"`` list holds one record per child.  Children recorded
    with type ``"Index"`` are rebuilt recursively from the JSON stored under
    their hashcode (loaded with ``read``); every other child becomes a plain
    ``Enitiy``.
    """

    def __init__(self, name, jsons=None, hashcode=None):
        self.name = name
        self.index = {}
        records = [] if jsons is None else jsons["index"]
        for record in records:
            kind = record["type"]
            child_name = record["name"]
            child_hash = record["hashcode"]
            if kind != "Index":
                # Records written without a history get a fresh empty one.
                self.index[child_name] = Enitiy(
                    child_name, kind, child_hash,
                    record.get("history", dict()))
                continue
            self.index[child_name] = Index(
                child_name, json.loads(read(child_hash)), child_hash)
        super().__init__(name, "Index", hashcode, None)

    def __str__(self):
        """Render as the string form of the underlying child dict."""
        return str(self.index)

    def __getitem__(self, k):
        """Return the child stored under name *k* (KeyError if absent)."""
        return self.index[k]

    def __setitem__(self, k, v):
        """Store child *v* under name *k*, replacing any previous entry."""
        self.index[k] = v

    def __contains__(self, item):
        """Tell whether a child named *item* exists."""
        return item in self.index

    def __len__(self):
        """Return the number of direct children."""
        return len(self.index)

    def items(self):
        """Return the ``(name, entity)`` view of the direct children."""
        return self.index.items()

    def dumpJsons(self):
        """Serialize this index (direct children only) to a JSON string.

        Key order is kept stable because the resulting text is what gets
        hashed and stored.
        """
        children = [
            {
                "name": child.name,
                "type": child.dtype,
                "hashcode": child.hashcode,
                "history": child.history,
            }
            for child in self.index.values()
        ]
        return json.dumps({"index": children, "name": self.name})
# Keep a handle on the builtin before the name is rebound below.
_open = open


def open(hashcode, type="w+"):
    """Open the stored object *hashcode* as UTF-8 text, creating its directory.

    The ``type`` argument is accepted but never consulted: an existing file is
    always opened ``"r+"`` (read/write, no truncation) and a missing one is
    created with ``"w"``.
    """
    path = getPath(hashcode)
    os.makedirs(os.path.dirname(path), exist_ok=True)
    mode = "r+" if os.path.isfile(path) else "w"
    return _open(path, mode, encoding="UTF-8")
def getPath(hashcode):
    """Return the on-disk path for *hashcode*.

    Objects are bucketed into a sub-directory named after the first two
    characters of the key: ``<dataLoc>/<key[:2]>/<key>``.
    """
    bucket = hashcode[:2]
    return os.path.join(dataLoc, bucket, hashcode)
def hash(str):
    """Return the SHA-1 hex digest of the bytes-like object *str*.

    Note that this rebinds the builtin name ``hash`` for the module.
    """
    digest = hashlib.sha1()
    digest.update(str)
    return digest.hexdigest()
def hashStr(str):
    """Encode the text *str* as UTF-8 and return ``hash`` of those bytes."""
    encoded = str.encode("utf-8")
    return hash(encoded)
def hashFile(name, content):
    """Return ``hash`` of ``"<name>|<content>"`` encoded as UTF-8.

    Including the name means identical content stored under two names gets
    two different keys.
    """
    keyed = "|".join((name, content))
    return hash(keyed.encode("utf-8"))
def read(hashcode):
    """Return the full text stored under *hashcode*.

    Goes through this module's ``open``, so the file is looked up via
    ``getPath``.
    """
    with open(hashcode) as stored:
        text = stored.read()
    return text
def init():
    """Load (or create on first run) the root index and return ``[root]``.

    The object stored under the fixed key ``"root"`` is a small JSON pointer
    whose ``"hashcode"`` names the current root index.  The returned
    one-element list is the initial directory stack.
    """
    name = "/"
    if os.path.isfile(getPath("root")):
        # Existing store: follow the pointer to the saved root index.
        with open("root") as f:
            hashcode = json.loads(f.read())["hashcode"]
        with open(hashcode) as f:
            root = Index(name, json.loads(f.read()), hashcode)
        return [root]

    # First run: persist an empty root index and a pointer to it.
    root = Index(name)
    root_json = root.dumpJsons()
    hashcode = hashFile(name, root_json)
    with open(hashcode) as f:
        f.write(root_json)
    pointer = {"hashcode": hashcode, "name": name, "type": "Index"}
    with open("root") as f:
        f.write(json.dumps(pointer))
    return [root]
def end(indexStack):
    """Persist the root index and repoint the ``"root"`` pointer at it.

    ``indexStack[0]`` is taken as the root.  Its JSON is written under its
    hash, then the ``"hashcode"`` field of the ``"root"`` pointer object is
    rewritten in place.

    NOTE(review): only the root's own ``dumpJsons()`` output is written here;
    it contains just the hashcodes of child indexes.  Confirm whether
    in-memory changes made inside sub-directories are persisted elsewhere.
    """
    root = indexStack[0]
    name = "/"
    content = root.dumpJsons()
    hashcode = hashFile(name, content)
    with open(hashcode) as f:
        f.write(content)
    with open("root") as f:
        accessPt = json.loads(f.read())
        accessPt["hashcode"] = hashcode
        # The file is open in "r+" mode: rewind, overwrite, then cut off any
        # leftover tail from the previous (possibly longer) content.
        f.seek(0)
        f.write(json.dumps(accessPt))
        f.truncate()
# Help text printed by the interactive "help" prompt, keyed by command name.
helpCmd = {
    "update": [
        "Usage>> update <name> <type> <content>",
        "Usage>> Update or create a file",
    ],
    "rollback": [
        "Usage>> rollback <name> <id>",
        "Usage>> Rollback to the specified id",
    ],
    "ls": [
        "Usage>> ls [name]",
        "Usage>> list all file in current dir or specified dir",
    ],
    "cd": [
        "Usage>> cd <dir|'..'>",
        "Usage>> cd to specified dir",
    ],
    "mkdir": [
        "Usage>> mkdir <dir>",
        "Usage>> create a new dir at current dir",
    ],
    "read": [
        "Usage>> read <name>",
        "Usage>> read the content of specified file",
    ],
    "history": [
        "Usage>> history <name>",
        "Usage>> List modifiy history of specified file",
    ],
    "help": [
        "Usage>> <cmd>",
        "Usage>> Print the usage of cmd",
        "Usage>> all cmd[update, rollback, ls, cd, mkdir, read, history, help]",
    ],
}
# Commands the main prompt dispatches on: exactly the ones with help text.
mainCmd = set(helpCmd)
def update(indexStack, name, dtype, content):
    """Create or update the entry *name* in the current directory.

    The content is always written under its hash first.  A new name gets a
    fresh entry (an ``Index`` when ``dtype == "Index"``, otherwise an
    ``Enitiy`` with empty history).  For an existing name the previous
    hashcode is filed in its history under a revision id, and ``dtype`` is
    left as it was.

    NOTE: the revision id is the first six hex digits of the hash of a
    second-resolution timestamp, so two updates of the same entry within one
    second share an id and the later history record replaces the earlier.
    """
    current = indexStack[-1]
    stamp = datetime.datetime.now().strftime("%Y/%m/%d %H:%M:%S")
    revision = hashStr(stamp)[:6]
    new_hash = hashFile(name, content)
    with open(new_hash) as out:
        out.write(content)

    if name in current:
        existing = current[name]
        existing.history[revision] = (stamp, existing.hashcode)
        existing.hashcode = new_hash
        return

    if dtype == "Index":
        current[name] = Index(name, hashcode=new_hash)
    else:
        current[name] = Enitiy(name, dtype, new_hash, dict())
def rollback(indexStack, name, datetimeid):
    """Restore entry *name* to the revision recorded under *datetimeid*.

    indexStack -- directory stack; the last element is the current directory
    name       -- entry in the current directory (must not be an Index)
    datetimeid -- revision id, a key of the entry's history

    The old content is re-applied through ``update``, so the rollback itself
    is recorded as a new revision.
    """
    current = indexStack[-1]
    entry = current[name]
    assert type(entry) is not Index
    revisions = entry.history
    assert datetimeid in revisions
    old_hash = revisions[datetimeid][1]
    with open(old_hash) as stored:
        update(indexStack, name, entry.dtype, stored.read())
def ls(indexStack, name=None):
    """List a directory as ``(dir_name, sorted [(entry_name, dtype), ...])``.

    Without *name* the current directory (top of the stack) is listed.  With
    *name*, that child directory is loaded from its stored JSON and listed.
    """
    if name is None:
        index = indexStack[-1]
    else:
        stored = read(indexStack[-1][name].hashcode)
        index = Index(name, json.loads(stored))
    assert type(index) is Index
    entries = [(e.name, e.dtype) for _, e in index.items()]
    entries.sort()
    return index.name, entries
def cd(indexStack, name):
    """Change the current directory by editing *indexStack* in place.

    indexStack -- directory stack (a list); the last element is the current
                  directory and the first is the root
    name       -- child directory to enter, or ".." to go up one level

    ".." at the root is a no-op: the root is never popped, because an empty
    stack breaks every later ``indexStack[-1]`` lookup and the final save.

    Raises AssertionError on wrong argument types or when *name* is not a
    directory, and KeyError when *name* does not exist.
    """
    assert type(indexStack) is list
    assert type(name) is str
    if name == "..":
        if len(indexStack) > 1:
            indexStack.pop()
        return
    index = indexStack[-1]
    assert type(index[name]) is Index
    indexStack.append(index[name])
def getHistory(indexStack, name):
    """Return the history of file *name* in the current directory.

    indexStack -- directory stack (a list); the last element is the current
                  directory
    name       -- file name (a str); must not refer to an Index
    """
    assert type(indexStack) is list
    assert type(name) is str
    current = indexStack[-1]
    assert type(current[name]) is not Index
    return current[name].history
def mkdir(indexStack, name):
    """Create an empty directory *name* in the current directory.

    Asserts that the name is not already taken, then stores the JSON of an
    empty ``Index`` through ``update`` with type ``"Index"``.
    """
    current = indexStack[-1]
    assert name not in current
    empty_index_json = Index(name).dumpJsons()
    update(indexStack, name, "Index", empty_index_json)
if __name__ == '__main__':
    # Interactive shell over the store.  The root index is saved by end() on
    # every way out of the loop, including uncaught exceptions.
    indexStack = init()
    try:
        while True:
            prompt = "/".join([i.name for i in indexStack]) + ">> "
            cmd = input(prompt).split(" ")
            command, args = cmd[0], cmd[1:]

            if command == "exit" or command == "quit":
                break
            if command not in mainCmd:
                print("E> Invalid command")
                continue

            if command == "update":
                if len(args) < 3:
                    print("E>")
                else:
                    # The line was split on single spaces, so re-joining the
                    # tail restores content that itself contains spaces.
                    update(indexStack, args[0], args[1], " ".join(args[2:]))
            elif command == "rollback":
                if len(args) != 2:
                    print("E>")
                else:
                    rollback(indexStack, args[0], args[1])
            elif command == "ls":
                if len(args) == 1:
                    listing = ls(indexStack, args[0])
                else:
                    listing = ls(indexStack)
                print(listing[0] + " :")
                print("\n".join(["\t" + n + " | " + t for n, t in listing[1]]))
            elif command == "cd":
                if len(args) != 1:
                    print("E>")
                else:
                    cd(indexStack, args[0])
            elif command == "mkdir":
                if len(args) != 1:
                    print("E>")
                else:
                    mkdir(indexStack, args[0])
            elif command == "read":
                if len(args) != 1:
                    print("E>")
                else:
                    print(read(indexStack[-1][args[0]].hashcode))
            elif command == "history":
                if len(args) == 1:
                    revisions = getHistory(indexStack, args[0])
                    print(args[0] + " :")
                    print("id\t|date")
                    # Oldest first: sort the (id, date) pairs by date string.
                    rows = sorted(
                        [(dateid, v[0]) for dateid, v in revisions.items()],
                        key=lambda x: x[1])
                    print("\n".join(
                        ["{}\t|{}".format(x[0], x[1]) for x in rows]))
                else:
                    print("E>")
            elif command == "help":
                # Nested prompt: type a command name for its usage, or
                # exit/quit to return to the main prompt.
                while True:
                    topic = input("Help>> ")
                    if topic == "exit" or topic == "quit":
                        break
                    if topic in helpCmd:
                        for line in helpCmd[topic]:
                            print(line)
                    else:
                        print("E> Invalid command")
    finally:
        end(indexStack)

复制代码

使用道具举报

crossin先生

174 主题	45 好友	10万积分

管理员

Rank: 9 Rank: 9 Rank: 9

发消息

44^#

发表于 2016-10-11 09:54:18 |只看该作者

manhong2112 发表于 2016-10-10 21:00
原本想弄個筆記本, 然後架個站自用...然後寫着寫着寫成了vcs(算嗎?算吧)......
話說混亂到自己都看不太下去 ...

你厉害的

#==== Crossin的编程教室 ====#
微信ID：crossincode
网站：http://crossincode.com

使用道具举报

manhong2112

1 主题	0 好友	207 积分

中级会员

Rank: 3 Rank: 3

发消息

45^#

发表于 2016-10-14 19:58:04 |只看该作者

和之前的筆記本一系列的, 用來抽取指定網址的內容!
目前只做了知乎, 以後再慢慢拓展
#註解掉的是md轉html的

from bs4 import BeautifulSoup
import urllib.request as urllib
import re
#import markdown
def getContext(url):
    """Fetch *url* and return its extracted fields as a Markdown string.

    The page is parsed with the rule ``getRule`` selects for the URL.  The
    output has four lines (title, author, link, content); a field that
    ``parse`` did not return leaves its line empty.
    """
    data = parse(url, getRule(url))
    fields = {
        "title": ("#" + data["title"]) if "title" in data else "",
        "author": ("###" + data["author"]) if "author" in data else "",
        "url": ("#####[Link](" + data["url"] + ")") if "url" in data else "",
        "content": ("<hr><br>" + data["content"]) if "content" in data else "",
    }
    md = "{title}\n{author}\n{url}\n{content}".format(**fields)
    # For HTML output instead, render `md` with
    # markdown.markdown(md, output_format="html5") and prepend
    # "<meta charset='UTF-8'>\n" (needs the commented-out markdown import).
    return md
# Extraction rules: a list of (url_pattern, field_specs) pairs.  For each
# output field, field_specs gives a CSS "selector", optionally an "attr" to
# read instead of the element text, and optionally a "matching" regex whose
# group(1) becomes the value.
rule = [
    # Zhihu answer pages.  The dots in the host name are escaped so that
    # look-alike hosts (e.g. "wwwXzhihuYcom") no longer match.
    (re.compile(r"https?://www\.zhihu\.com/question/\d*?/answer/\d*?#?.*"),
     {"content": {"selector": "div#zh-question-answer-wrap .zm-editable-content"},
      "title": {"selector": "div#zh-question-title > h2.zm-item-title > a"},
      "author": {"selector": "span.author-link-line > a.author-link"},
      # NOTE(review): the attribute name is quoted in this selector, which
      # is unusual CSS -- confirm the installed BeautifulSoup accepts it.
      "url": {"selector": "meta['http-equiv'='mobile-agent']", "attr": "content", "matching": re.compile(".*url=(.*)")}
      })
]
def getRule(url):
    """Return the field specs of the first rule whose pattern matches *url*.

    Returns ``None`` when no rule matches.
    """
    return next((spec for pattern, spec in rule if pattern.match(url)), None)
def getHtml(url, encoding="utf-8"):
    """Download *url* and return the raw response body as bytes.

    ``encoding`` is accepted for interface compatibility but is not used:
    the bytes are returned undecoded.  The response is closed before
    returning.
    """
    with urllib.urlopen(url) as res:
        return res.read()
def parse(url, rule):
    """Download *url* and extract the fields described by *rule*.

    *rule* maps a field name to a spec dict with a CSS ``"selector"`` and,
    optionally, ``"attr"`` (read this attribute instead of the element text)
    and ``"matching"`` (a compiled regex; group(1) of a match at the start of
    the value is kept).

    Returns a dict with only the fields that could be extracted.  A field is
    omitted when the selector finds no element, the requested attribute is
    absent, or the ``"matching"`` regex does not match -- callers are
    expected to test ``key in result``.
    """
    result = dict()
    soup = BeautifulSoup(getHtml(url), "html.parser")
    for k, v in rule.items():
        found = soup.select(v["selector"])
        if not found:
            # Page layout differs from the rule: skip the field, don't crash.
            continue
        node = found[0]
        if "attr" in v:
            raw = node.get(v["attr"])
            if raw is None:
                continue
        else:
            raw = node.get_text()
        content = str(raw).strip(' \t\n\r')
        if "matching" in v:
            m = v["matching"].match(content)
            if m:
                result[k] = m.group(1)
        else:
            result[k] = content
    return result
# Demo: fetch one Zhihu answer and print the extracted Markdown.
# This runs whenever the module is executed or imported (no __main__ guard).
print(getContext("http://www.zhihu.com/question/51266789/answer/125952575"))

复制代码

使用道具举报

1 2 3 4 5 / 5 页

返回列表

		自动登录	找回密码
密码			立即加入