Remove some subprocesses, comment out logging.

Add:
    bitshift/crawler/
        (crawler, indexer).py
            -comment out all logging statements, as they may be causing a
            memory leak (the crawler is meant to run perpetually, so,
            depending on how the `logging` module is implemented, it may be
            accumulating logged strings in memory); see the note after this
            message for a one-line global alternative.

        bitshift/crawler/indexer.py
            -make `_index_repository()` and `_insert_repository_codelets()`
            methods of the `GitIndexer` class.
            -replace the `_get_tracked_files()` subprocess call, which found
            the files in a Git repository and removed any that were non-ASCII,
            with a pure-Python solution.
            -add `_is_ascii()`.
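An aside on the memory-leak hypothesis above: the stock `logging` handlers
write each record out and discard it, so any accumulation is more likely in a
buffering handler than in the module itself. Either way, if the goal is just
to silence the crawler, a single process-wide switch avoids commenting out
every call site. A minimal sketch (not part of this commit):

    import logging

    # Suppress every logging call at severity CRITICAL and below,
    # process-wide; a one-line alternative to commenting out each
    # individual logging statement.
    logging.disable(logging.CRITICAL)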
tags/v1.0^2 · Severyn Kozak, 10 years ago · parent commit f38772760b
2 changed files with 181 additions and 106 deletions:
    bitshift/crawler/crawler.py  (+9, -9)
    bitshift/crawler/indexer.py  (+172, -97)

bitshift/crawler/crawler.py (+9, -9)

@@ -34,7 +34,7 @@ class GitHubCrawler(threading.Thread):
         """

         self.clone_queue = clone_queue
-        logging.info("Starting %s." % self.__class__.__name__)
+        # logging.info("Starting %s." % self.__class__.__name__)
         super(GitHubCrawler, self).__init__(name=self.__class__.__name__)

     def run(self):
@@ -61,10 +61,10 @@ class GitHubCrawler(threading.Thread):

         queue_percent_full = (float(self.clone_queue.qsize()) /
                 self.clone_queue.maxsize) * 100
-        logging.info("API call made. Limit remaining: %s. Queue-size: (%d"
-                "%%) %d/%d" % (response.headers["x-ratelimit-remaining"],
-                queue_percent_full, self.clone_queue.qsize(),
-                self.clone_queue.maxsize))
+        # logging.info("API call made. Limit remaining: %s. Queue-size: (%d"
+        #         "%%) %d/%d" % (response.headers["x-ratelimit-remaining"],
+        #         queue_percent_full, self.clone_queue.qsize(),
+        #         self.clone_queue.maxsize))

         for repo in response.json():
             while self.clone_queue.full():
@@ -107,7 +107,7 @@ class BitbucketCrawler(threading.Thread):
         """

         self.clone_queue = clone_queue
-        logging.info("Starting %s." % self.__class__.__name__)
+        # logging.info("Starting %s." % self.__class__.__name__)
         super(BitbucketCrawler, self).__init__(name=self.__class__.__name__)

     def run(self):
@@ -127,9 +127,9 @@ class BitbucketCrawler(threading.Thread):

         queue_percent_full = (float(self.clone_queue.qsize()) /
                 self.clone_queue.maxsize) * 100
-        logging.info("API call made. Queue-size: (%d%%) %d/%d" % (
-                queue_percent_full, self.clone_queue.qsize(),
-                self.clone_queue.maxsize))
+        # logging.info("API call made. Queue-size: (%d%%) %d/%d" % (
+        #         queue_percent_full, self.clone_queue.qsize(),
+        #         self.clone_queue.maxsize))

         for repo in response["values"]:
             if repo["scm"] == "git":


bitshift/crawler/indexer.py (+172, -97)

@@ -3,7 +3,7 @@
 repositories.
 """

-import bs4, logging, os, Queue, re, shutil, subprocess, time, threading
+import bs4, logging, os, Queue, re, shutil, string, subprocess, time, threading

 from ..database import Database
 from ..codelet import Codelet
@@ -63,10 +63,12 @@ class GitIndexer(threading.Thread):

         MAX_INDEX_QUEUE_SIZE = 10

-        logging.info("Starting.")
+        # logging.info("Starting.")
+
         self.index_queue = Queue.Queue(maxsize=MAX_INDEX_QUEUE_SIZE)
         self.git_cloner = _GitCloner(clone_queue, self.index_queue)
         self.git_cloner.start()
+        self.codelet_count = 0 #debug

         if not os.path.exists(GIT_CLONE_DIR):
             os.makedirs(GIT_CLONE_DIR)
@@ -89,14 +91,91 @@ class GitIndexer(threading.Thread):

             repo = self.index_queue.get()
             self.index_queue.task_done()
-            _index_repository(repo.url, repo.name, repo.framework_name)
+            self._index_repository(repo.url, repo.name, repo.framework_name)
+
+    def _index_repository(self, repo_url, repo_name, framework_name):
+        """
+        Clone and index (create and insert Codelets for) a Git repository.
+
+        `git clone` the Git repository located at **repo_url**, call
+        _insert_repository_codelets(), then remove said repository.
+
+        :param repo_url: The url the Git repository was cloned from.
+        :param repo_name: The name of the repository.
+        :param framework_name: The name of the framework the repository is from.
+
+        :type repo_url: str
+        :type repo_name: str
+        :type framework_name: str
+        """
+
+        # logging.info("Indexing repository %s." % repo_url)
+        with _ChangeDir("%s/%s" % (GIT_CLONE_DIR, repo_name)) as repository_dir:
+            try:
+                self._insert_repository_codelets(repo_url, repo_name,
+                        framework_name)
+            except Exception as exception:
+                # logging.warning(
+                #         "_insert_repository_codelets() failed: %s: %s: %s" %
+                #         (exception.__class__.__name__, exception, repo_url))
+                pass
+
+        if os.path.isdir("%s/%s" % (GIT_CLONE_DIR, repo_name)):
+            shutil.rmtree("%s/%s" % (GIT_CLONE_DIR, repo_name))
+
+    def _insert_repository_codelets(self, repo_url, repo_name, framework_name):
+        """
+        Create and insert a Codelet for the files inside a Git repository.
+
+        Create a new Codelet, and insert it into the Database singleton, for
+        every file inside the current working directory's default branch
+        (usually *master*).
+
+        :param repo_url: The url the Git repository was cloned from.
+        :param repo_name: The name of the repository.
+        :param framework_name: The name of the framework the repository is from.
+
+        :type repo_url: str
+        :type repo_name: str
+        :type framework_name: str
+        """
+
+        commits_meta = _get_commits_metadata()
+        for filename in commits_meta.keys():
+            try:
+                with open(filename, "r") as source_file:
+                    source = _decode(source_file.read())
+                    if source is None:
+                        return
+            except IOError as exception:
+                # logging.warning(
+                #         "_insert_repository_codelets() failed: %s: %s: %s" %
+                #         (exception.__class__.__name__, exception, repo_url))
+                pass
+
+            authors = [(_decode(author),) for author in \
+                    commits_meta[filename]["authors"]]
+            codelet = Codelet("%s:%s" % (repo_name, filename), source, filename,
+                    None, authors, _generate_file_url(filename, repo_url,
+                    framework_name),
+                    commits_meta[filename]["time_created"],
+                    commits_meta[filename]["time_last_modified"])
+
+            self.codelet_count += 1 #debug
+            if self.codelet_count % 500 == 0: #debug
+                logging.info("Number of codelets indexed: %d.", self.codelet_count) #debug
+
+            # Database.insert(codelet)

 class _GitCloner(threading.Thread):
     """
     A singleton Git repository cloner.

+    Clones the repositories crawled by :class:`crawler.GitHubCrawler` for
+    :class:`GitIndexer` to index.
+
     :ivar clone_queue: (:class:`Queue.Queue`) see
-        :attr:`bitshift.crawler.crawler.GitHubCrawler.clone_queue`.
+        :attr:`crawler.GitHubCrawler.clone_queue`.
     :ivar index_queue: (:class:`Queue.Queue`) see
         :attr:`GitIndexer.index_queue`.
     """
@@ -112,6 +191,8 @@ class _GitCloner(threading.Thread):
         :type index_queue: see :attr:`self.index_queue`
         """

+        # logging.info("Starting.")
+
         self.clone_queue = clone_queue
         self.index_queue = index_queue
         super(_GitCloner, self).__init__(name=self.__class__.__name__)
@@ -146,16 +227,29 @@ class _GitCloner(threading.Thread):

         queue_percent_full = (float(self.index_queue.qsize()) /
                 self.index_queue.maxsize) * 100
-        logging.info("Cloning %s. Queue-size: (%d%%) %d/%d" % (repo.url,
-                queue_percent_full, self.index_queue.qsize(),
-                self.index_queue.maxsize))
+        # logging.info("Cloning %s. Queue-size: (%d%%) %d/%d" % (repo.url,
+        #         queue_percent_full, self.index_queue.qsize(),
+        #         self.index_queue.maxsize))

+        exit_code = None
         command = ("perl -e 'alarm shift @ARGV; exec @ARGV' %d git clone"
                 " --single-branch %s %s/%s || pkill -f git")
-        if subprocess.call(command % (GIT_CLONE_TIMEOUT, repo.url,
-                GIT_CLONE_DIR, repo.name), shell=True) != 0:
-            logging.warning("_clone_repository(): Cloning %s failed." %
-                    repo.url)
+
+        while exit_code is None:
+            try:
+                exit_code = subprocess.call(command % (GIT_CLONE_TIMEOUT,
+                        repo.url, GIT_CLONE_DIR, repo.name), shell=True)
+            except:
+                # logging.warning("_clone_repository() failed: %s: %s",
+                #         exception.__class__.__name__, exception)
+                time.sleep(1)
+                continue
+            else:
+                break
+
+        if exit_code != 0:
+            # logging.warning("_clone_repository(): Cloning %s failed." %
+            #         repo.url)
             if os.path.isdir("%s/%s" % (GIT_CLONE_DIR, repo.name)):
                 shutil.rmtree("%s/%s" % (GIT_CLONE_DIR, repo.name))
             return
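Editorial note on the hunk above: the `perl -e 'alarm shift @ARGV; exec @ARGV'`
wrapper is a portable clone timeout. Perl schedules a SIGALRM after
GIT_CLONE_TIMEOUT seconds and then exec()s `git clone`; pending alarms survive
exec(), so the clone is killed if it outlives the alarm, and `|| pkill -f git`
reaps any stragglers. A pure-Python equivalent is possible too; this is a
rough sketch for Python 2 (whose `subprocess.call()` has no `timeout`
argument), with an illustrative function name and timeout value:

    import subprocess, time

    GIT_CLONE_TIMEOUT = 600  # seconds; assumed value

    def call_with_timeout(args, timeout=GIT_CLONE_TIMEOUT):
        """Run a command, killing it if it outlives `timeout` seconds."""
        process = subprocess.Popen(args)
        deadline = time.time() + timeout
        while process.poll() is None:  # None means still running
            if time.time() > deadline:
                process.kill()
                return None  # signal a timeout to the caller
            time.sleep(1)
        return process.returncode

    # e.g.: call_with_timeout(["git", "clone", "--single-branch", url, path])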
@@ -203,74 +297,6 @@ class _ChangeDir(object):

         os.chdir(self.old_path)

-
-def _index_repository(repo_url, repo_name, framework_name):
-    """
-    Clone and index (create and insert Codeletes for) a Git repository.
-
-    `git clone` the Git repository located at **repo_url**, call
-    _insert_repository_codelets, then remove said repository.
-
-    :param repo_url: The url the Git repository was cloned from.
-    :param repo_name: The name of the repository.
-    :param framework_name: The name of the framework the repository is from.
-
-    :type repo_url: str
-    :type repo_name: str
-    :type framework_name: str
-    """
-
-    logging.info("Indexing repository %s." % repo_url)
-    with _ChangeDir("%s/%s" % (GIT_CLONE_DIR, repo_name)) as repository_dir:
-        try:
-            _insert_repository_codelets(repo_url, repo_name,
-                    framework_name)
-        except Exception as exception:
-            logging.warning(
-                    "_insert_repository_codelets() failed: %s: %s: %s" %
-                    (exception.__class__.__name__, exception, repo_url))
-
-    if os.path.isdir("%s/%s" % (GIT_CLONE_DIR, repo_name)):
-        shutil.rmtree("%s/%s" % (GIT_CLONE_DIR, repo_name))
-
-def _insert_repository_codelets(repo_url, repo_name, framework_name):
-    """
-    Create and insert a Codelet for the files inside a Git repository.
-
-    Create a new Codelet, and insert it into the Database singleton, for every
-    file inside the current working directory's default branch (usually
-    *master*).
-
-    :param repo_url: The url the Git repository was cloned from.
-    :param repo_name: The name of the repository.
-    :param framework_name: The name of the framework the repository is from.
-
-    :type repo_url: str
-    :type repo_name: str
-    :type framework_name: str
-    """
-
-    commits_meta = _get_commits_metadata()
-    for filename in commits_meta.keys():
-        try:
-            with open(filename, "r") as source_file:
-                source = _decode(source_file.read())
-                if source is None:
-                    return
-        except IOError as exception:
-            logging.warning(
-                    "_insert_repository_codelets() failed: %s: %s: %s" %
-                    (exception.__class__.__name__, exception, repo_url))
-
-        authors = [(_decode(author),) for author in \
-                commits_meta[filename]["authors"]]
-        codelet = Codelet("%s:%s" % (repo_name, filename), source, filename,
-                None, authors, _generate_file_url(filename, repo_url,
-                framework_name),
-                commits_meta[filename]["time_created"],
-                commits_meta[filename]["time_last_modified"])
-
-        # Database.insert(codelet)
-
 def _generate_file_url(filename, repo_url, framework_name):
     """
     Return a url for a filename from a Git wrapper framework.
@@ -288,19 +314,25 @@ def _generate_file_url(filename, repo_url, framework_name):
     :rtype: str, or None

     .. warning::
-        `git branch` will occasionally fail, and, seeing as its a crucial
-        component of GitHub's repository file urls, None will be returned.
+        Various Git subprocesses will occasionally fail, and, seeing as the
+        information they provide is a crucial component of some repository file
+        urls, None may be returned.
     """

-    if framework_name == "GitHub":
-        try:
-            default_branch = subprocess.check_output("git branch --no-color",
-                    shell=True)[2:-1]
-            return "%s/blob/%s/%s" % (repo_url, default_branch, filename)
-        except CalledProcessError as exception:
-            logging.warning("_generate_file_url(): %s: %s",
-                    exception.__class__.name, exception)
-            return None
+    try:
+        if framework_name == "GitHub":
+            default_branch = subprocess.check_output("git branch"
+                    " --no-color", shell=True)[2:-1]
+            return ("%s/blob/%s/%s" % (repo_url, default_branch,
+                    filename)).replace("//", "/")
+        elif framework_name == "Bitbucket":
+            commit_hash = subprocess.check_output("git rev-parse HEAD",
+                    shell=True).replace("\n", "")
+            return ("%s/src/%s/%s" % (repo_url, commit_hash,
+                    filename)).replace("//", "/")
+    except subprocess.CalledProcessError as exception:
+        # logging.warning("_generate_file_url() failed: %s", exception)
+        return None

 def _get_git_commits():
     """
@@ -354,12 +386,15 @@ def _get_tracked_files():
     GIT_IGNORE_EXTENSIONS = ["t[e]?xt(ile)?", "m(ark)?down", "mkd[n]?",
             "md(wn|t[e]?xt)?", "rst"]

-    tracked_files = subprocess.check_output(("perl -le 'for (@ARGV){ print if \
-            -f && -T }' $(find . -type d -name .git -prune -o -print)"),
-            shell=True).split("\n")[:-1]
+    files = []
+    for dirname, subdir_names, filenames in os.walk("."):
+        for filename in filenames:
+            path = os.path.join(dirname, filename)
+            if _is_ascii(path):
+                files.append(path)

     valuable_files = []
-    for filename in tracked_files:
+    for filename in files:
         filename_match = any([re.match(pattern, filename, flags=re.IGNORECASE)
                 for pattern in GIT_IGNORE_FILES])
         extension = filename.split(".")[-1]
@@ -431,7 +466,47 @@ def _decode(raw):
         encoding = bs4.BeautifulSoup(raw).original_encoding
         return raw.decode(encoding) if encoding is not None else None

-    except Exception as exception:
-        logging.warning("_decode(): %s: %s", exception.__class__.__name__,
-                exception)
+    except (LookupError, UnicodeDecodeError, UserWarning) as exception:
+        # logging.warning("_decode() failed: %s: %s",
+        #         exception.__class__.__name__, exception)
         return None
+
+def _is_ascii(filename):
+    """
+    Heuristically determine whether a file is ASCII text or binary.
+
+    If a portion of the file contains null bytes, or the percentage of bytes
+    that aren't ASCII is greater than 30%, then the file is concluded to be
+    binary. This heuristic is used by the `file` utility and Perl's inbuilt
+    `-T` operator, and is the de facto method for determining whether a file
+    is ASCII.
+
+    :param filename: The path of the file to test.
+
+    :type filename: str
+
+    :return: Whether the file is probably ASCII.
+    :rtype: Boolean
+    """
+
+    try:
+        with open(filename) as source:
+            file_snippet = source.read(512)
+
+            if not file_snippet:
+                return True
+
+            ascii_characters = "".join(map(chr, range(32, 127)) +
+                    list("\n\r\t\b"))
+            null_trans = string.maketrans("", "")
+
+            if "\0" in file_snippet:
+                return False
+
+            non_ascii = file_snippet.translate(null_trans, ascii_characters)
+            return not float(len(non_ascii)) / len(file_snippet) > 0.30
+
+    except IOError as exception:
+        # logging.warning("_is_ascii() failed: %s: %s",
+        #         exception.__class__.__name__, exception)
+        return False
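A quick way to sanity-check the new heuristic from a REPL (Python 2, matching
the module; the sample file paths are hypothetical):

    from bitshift.crawler.indexer import _is_ascii

    print _is_ascii("README.rst")    # True for plain text
    print _is_ascii("logo.png")      # False: PNG data contains null bytes
    print _is_ascii("missing_file")  # False: the IOError is caught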
