
Test crawler, complete documentation.

Add, Fix:
    bitshift/crawler/
        __init__.py
            -add module and crawl() docstrings.
            -add repository_queue size limit.

        crawler.py
            -account for time spent executing an API query in the run() loop
            sleep() interval.
tags/v1.0^2
Severyn Kozak, 10 years ago
parent
commit b680756f8d
3 changed files with 91 additions and 33 deletions
  1. bitshift/crawler/__init__.py   +17  -1
  2. bitshift/crawler/crawler.py    +74  -32
  3. bitshift/crawler/indexer.py    +0   -0

bitshift/crawler/__init__.py  +17  -1

@@ -1,3 +1,9 @@
+"""
+:synopsis: Parent crawler module, which supervises all crawlers.
+
+Contains functions for initializing all subsidiary, threaded crawlers.
+"""
+
 import Queue
 
 from bitshift.crawler import crawler
@@ -5,8 +11,18 @@ from bitshift.crawler import git_indexer
 
 __all__ = ["crawl"]
 
+MAX_URL_QUEUE_SIZE = 5e3
+
 def crawl():
-    repository_queue = Queue.Queue()
+    """
+    Initialize all crawlers (and indexers).
+
+    Start the:
+        1. GitHub crawler, :class:`bitshift.crawler.crawler.GitHubCrawler`
+        2. Git indexer, :class:`bitshift.crawler.git_indexer.GitIndexer`
+    """
+
+    repository_queue = Queue.Queue(maxsize=MAX_URL_QUEUE_SIZE)
     github_crawler = crawler.GitHubCrawler(repository_queue)
     indexer = git_indexer.GitIndexer(repository_queue)
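
The size limit added above relies on the standard behaviour of a bounded Queue.Queue: once maxsize items are enqueued, put() blocks (or, with block=False, raises Queue.Full) until a consumer drains items. A minimal standalone sketch, independent of this repository and using illustrative values:

import Queue  # Python 2 stdlib module, as imported in __init__.py

repository_queue = Queue.Queue(maxsize=2)  # tiny bound, to show the behaviour

repository_queue.put({"url": "https://github.com/user/repo1"})
repository_queue.put({"url": "https://github.com/user/repo2"})

# The queue is now full. A default put() would block here until a consumer
# (the GitIndexer thread, in this codebase) calls get(); a non-blocking put
# raises Queue.Full, which a producer could catch and retry later.
try:
    repository_queue.put({"url": "https://github.com/user/repo3"}, block=False)
except Queue.Full:
    pass  # back off until the indexer drains the queue

A blocking put() provides the same back-pressure as the busy-wait on repository_queue.full() in crawler.py below, without spinning the CPU.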



bitshift/crawler/crawler.py  +74  -32

@@ -12,46 +12,88 @@ from ..codelet import Codelet
 from ..database import Database
 
 class GitHubCrawler(threading.Thread):
+    """
+    Crawler that retrieves links to all of GitHub's public repositories.
+
+    GitHubCrawler is a threaded singleton that queries GitHub's API for URLs
+    to its public repositories, which it inserts into a :class:`Queue.Queue`
+    shared with :class:`bitshift.crawler.git_indexer.GitIndexer`.
+
+    :ivar repository_queue: (:class:`Queue.Queue`) Contains dictionaries with
+        repository information retrieved by `GitHubCrawler`, and other Git
+        crawlers, to be processed by
+        :class:`bitshift.crawler.git_indexer.GitIndexer`.
+    """
+
     def __init__(self, repository_queue):
+        """
+        Create an instance of the singleton `GitHubCrawler`.
+
+        :param repository_queue: A queue containing dictionaries of repository
+            metadata retrieved by `GitHubCrawler`, meant to be processed by an
+            instance of :class:`bitshift.crawler.git_indexer.GitIndexer`.
+
+            .. code-block:: python
+                sample_dict = {
+                    "url" : "https://github.com/user/repo",
+                    "name" : "repo",
+                    "framework_name" : "GitHub"
+                }
+
+        :type repository_queue: :class:`Queue.Queue`
+        """
+
         self.repository_queue = repository_queue
         super(GitHubCrawler, self).__init__()
 
-    def run():
-        _github()
-
-def _github():
-    """
-    Query the GitHub API for data about every public repository.
+    def run(self):
+        """
+        Query the GitHub API for data about every public repository.
 
-    Pull all of GitHub's repositories by making calls to its API in a loop,
-    accessing a subsequent page of results via the "next" URL returned in an
-    API response header. Uses Severyn Kozak's (sevko) authentication
-    credentials.
-    """
+        Pull all of GitHub's repositories by making calls to its API in a loop,
+        accessing a subsequent page of results via the "next" URL returned in an
+        API response header. Uses Severyn Kozak's (sevko) authentication
+        credentials.
+        """
 
-    next_api_url = "https://api.github.com/repositories"
-    authentication_params = {
-        "client_id" : "436cb884ae09be7f2a4e",
-        "client_secret" : "8deeefbc2439409c5b7a092fd086772fe8b1f24e"
-    }
-    api_request_interval = 5e3 / 60 ** 2
+        next_api_url = "https://api.github.com/repositories"
+        authentication_params = {
+            "client_id" : "436cb884ae09be7f2a4e",
+            "client_secret" : "8deeefbc2439409c5b7a092fd086772fe8b1f24e"
+        }
+        api_request_interval = 5e3 / 60 ** 2
 
-    while len(next_api_url) > 0:
-        start_time = time.time()
-        response = requests.get(next_api_url, params=authentication_params)
+        while len(next_api_url) > 0:
+            # DEBUG
+            db.log.insert({
+                "time" : str(time.time()).split(".")[0][-4:],
+                "qsize" : self.repository_queue.qsize()
+            })
+            self.repository_queue.task_done()
+
+            start_time = time.time()
+            response = requests.get(next_api_url, params=authentication_params)
 
-        for repo in response.json():
-            self.repository_queue.put({
-                "url" : repo["html_url"],
-                "framework_name" : "GitHub"
-            })
+            for repo in response.json():
+                logging.basicConfig(filename="crawler.log", level=logging.DEBUG)
+                logging.debug("crawler: %-20s: %-5s: %-5s: %s",
+                        str(time.time()).split(".")[0],
+                        self.repository_queue.qsize(), repo["id"],
+                        repo["name"])
+                while self.repository_queue.full():
+                    pass
+                self.repository_queue.put({
+                    "url" : repo["html_url"],
+                    "name" : repo["html_url"].split("/")[-1],
+                    "framework_name" : "GitHub"
+                })
 
-        if int(response.headers["x-ratelimit-remaining"]) == 0:
-            time.sleep(int(response.headers["x-ratelimit-reset"]) - time.time())
+            if int(response.headers["x-ratelimit-remaining"]) == 0:
+                time.sleep(int(response.headers["x-ratelimit-reset"]) -
+                        time.time())
 
-        next_api_url = response.headers["link"].split(">")[0][1:]
+            next_api_url = response.headers["link"].split(">")[0][1:]
 
-        sleep_time = api_request_interval - (time.time() - start_time)
-        if sleep_time > 0:
-            time.sleep(sleep_time)
+            sleep_time = api_request_interval - (time.time() - start_time)
+            if sleep_time > 0:
+                time.sleep(sleep_time)
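
The fix called out in the commit message (subtracting the time spent on the API query from the sleep() interval) and the rate-limit check can be read in isolation. A self-contained restatement of that throttling logic follows; the helper name fetch_throttled is illustrative and not part of the codebase, while the interval mirrors the diff's 5e3 / 60 ** 2, i.e. 5,000 requests per hour, about 1.39 s per request:

import time
import requests

API_REQUEST_INTERVAL = 5e3 / 60 ** 2  # 5,000 requests/hour -> ~1.39 s per request

def fetch_throttled(url, params=None):
    """Issue one API request, then sleep only for whatever budget remains."""
    start_time = time.time()
    response = requests.get(url, params=params)

    # If the hourly quota is exhausted, wait until it resets
    # (x-ratelimit-reset is a Unix timestamp in GitHub's API).
    if int(response.headers["x-ratelimit-remaining"]) == 0:
        time.sleep(int(response.headers["x-ratelimit-reset"]) - time.time())

    # Deduct the time the request itself took; without this, each iteration
    # would add the request latency on top of the full interval and issue
    # fewer than 5,000 requests per hour.
    sleep_time = API_REQUEST_INTERVAL - (time.time() - start_time)
    if sleep_time > 0:
        time.sleep(sleep_time)
    return response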

bitshift/crawler/git_indexer.py → bitshift/crawler/indexer.py  +0  -0
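
One terse line in the crawler.py hunk is the pagination step, next_api_url = response.headers["link"].split(">")[0][1:]. GitHub returns a Link header listing the next page's URL in angle brackets; the snippet below unpacks how that slice extracts it. The header value here is made up for illustration:

# Shape of the Link header GitHub sends back (value is illustrative):
link_header = ('<https://api.github.com/repositories?since=369>; rel="next", '
               '<https://api.github.com/repositories{?since}>; rel="first"')

next_api_url = link_header.split(">")[0][1:]   # cut at the first '>', drop the leading '<'
assert next_api_url == "https://api.github.com/repositories?since=369"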

