Text Classification with DraCor#
DraCor (the Drama Corpora Project) provides corpora of plays through an extensive API. In this notebook we test to what extent a play's author can be identified using only the texts they wrote. This is a typical application of stylometry.
Creating the Corpus#
The first two functions are used to download a corpus of plays from DraCor:
from urllib import request
import json
dracor_api = "https://dracor.org/api" # DraCor API-endpoint
def get_dracor(corpus, play=None):
    """Load either the corpus metadata or a single play's spoken text."""
    url = dracor_api + "/corpora/" + corpus           # base URL for the corpus
    if play is not None:                              # a specific play requested?
        url = url + "/play/" + play + "/spoken-text"  # URL for the play's text
    with request.urlopen(url) as req:                 # download data
        text = req.read().decode()                    # read and decode the response
    if play is None:                                  # only metadata requested?
        return json.loads(text)                       # parse and return JSON corpus metadata
    return text                                       # return the play's text
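As a quick check, a single play's text can be fetched directly. The identifier below is only an assumed example; valid identifiers appear in the name field of the corpus metadata:
# "goethe-iphigenie-auf-tauris" is a hypothetical example identifier;
# look up real ones in get_dracor("ger")["dramas"]
print(get_dracor("ger", "goethe-iphigenie-auf-tauris")[:200])  # first 200 characters of spoken text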
def get_data(corpus):
    """Download the texts and authors of all single-author plays in a corpus."""
    texts = []                                      # spoken texts of the plays
    target = []                                     # authors of the plays
    for drama in get_dracor(corpus)["dramas"]:      # iterate over all plays
        name = drama["name"]                        # play identifier
        authors = drama["authors"]                  # play's authors
        if len(authors) == 1:                       # keep only plays written by a single author
            texts.append(get_dracor(corpus, name))  # download text
            target.append(authors[0]["fullname"])   # record author
    return texts, target                            # return texts and matching authors
texts, target = get_data("ger") # download GerDraCor
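A quick sanity check of what was downloaded (exact counts depend on the current state of GerDraCor, so none are given here):
print(len(texts), "plays")           # number of single-author plays
print(len(set(target)), "authors")   # number of distinct authors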
Text Classification#
Most classification methods require numerical input, so the texts have to be transformed before we can work with them. The following function vectorizes the given texts using the supplied transformation class, then trains and evaluates a naive Bayes classifier for multinomial models. This classifier is typically well suited to text classification.
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
def texteval(X, Y, vec):
    """Vectorize the texts, then train and evaluate a classifier."""
    X = vec.fit_transform(X)                                   # transform text data
    # note: fitting the vectorizer on the full corpus leaks document frequencies
    # into the test split; acceptable for this comparison, but not rigorous
    train_X, test_X, train_Y, test_Y = train_test_split(X, Y)  # split into training and test data
    clf = MultinomialNB()                                      # instantiate classifier
    clf.fit(train_X, train_Y)                                  # train model
    return clf.score(test_X, test_Y)                           # evaluate model on held-out data
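Since train_test_split draws a fresh random split on every call, the scores below vary from run to run. As a sketch of an alternative, scikit-learn's cross_val_score would average out some of this variance (texteval_cv is a name introduced here, not part of the original setup):
from sklearn.model_selection import cross_val_score

def texteval_cv(X, Y, vec, folds=5):
    X = vec.fit_transform(X)                     # transform text data
    clf = MultinomialNB()                        # same classifier as above
    return cross_val_score(clf, X, Y, cv=folds)  # one accuracy score per fold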
Now we can study how different kinds of text transformation influence the quality of the classification.
Word Frequency#
Let's begin with the simplest option: every document is represented by a vector that records, for each word of the corpus vocabulary, how often it occurs in that document (a bag of words). The CountVectorizer does this for us:
from sklearn.feature_extraction.text import CountVectorizer
for i in range(5):  # five runs, each with a fresh random train/test split
    print(texteval(texts, target, CountVectorizer()))
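To illustrate what the vectorizer produces, here is a toy example (not part of the experiment):
toy = ["the cat sat", "the cat and the dog"]
v = CountVectorizer()
print(v.fit_transform(toy).toarray())  # one row per document, one column per vocabulary word
print(v.get_feature_names_out())       # the vocabulary (scikit-learn >= 1.0)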
Frequent Words#
Only words that appear in at least 30% of documents:
for i in range(5):
    print(texteval(texts, target, CountVectorizer(min_df=0.3)))
Rare Words#
Only words that appear in at most 30% of documents:
for i in range(5):
    print(texteval(texts, target, CountVectorizer(max_df=0.3)))
Frequent Bigrams#
Word unigrams and bigrams (ngram_range=(1, 2)) that appear in at least 30% of documents; the custom token_pattern also keeps one-letter tokens, which the default pattern drops:
vec = CountVectorizer(ngram_range=(1, 2), token_pattern=r'\b\w+\b', min_df=0.3)
for i in range(5):
    print(texteval(texts, target, vec))
Rare Bigrams#
Word unigrams and bigrams that appear in at most 30% of documents:
vec = CountVectorizer(ngram_range=(1, 2), token_pattern=r'\b\w+\b', max_df=0.3)
for i in range(5):
    print(texteval(texts, target, vec))
TF-IDF#
Words that occur in many documents are often not very informative for any single document. The term frequency is therefore commonly set in relation to the number of documents in which the word appears. A widely used measure for this is TF-IDF (term frequency times inverse document frequency):
from sklearn.feature_extraction.text import TfidfVectorizer
for i in range(5):
    print(texteval(texts, target, TfidfVectorizer(min_df=0.3)))
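As a rough sketch of what the default TfidfVectorizer computes: each raw term count is multiplied by a smoothed inverse document frequency, and every document vector is then scaled to unit length (this follows scikit-learn's documented defaults):
toy = ["the cat sat", "the cat and the dog"]
tv = TfidfVectorizer()       # defaults: smooth_idf=True, norm='l2'
X = tv.fit_transform(toy)
# smoothed idf in scikit-learn: idf(t) = ln((1 + n) / (1 + df(t))) + 1
print(tv.idf_)               # idf weight for each vocabulary word
print(X.toarray().round(3))  # tf-idf matrix, one L2-normalized row per document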
Character frequency#
We can repeat these experiments at the level of individual characters. To do this we simply pass a different analyzer to the CountVectorizer. The 'char_wb' analyzer builds character n-grams only from text inside word boundaries and pads the edges of words with spaces:
for i in range(5):
    print(texteval(texts, target, CountVectorizer(analyzer='char_wb')))
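A toy look at the features this analyzer produces (with the default ngram_range=(1, 1), i.e. single characters including the padding space):
v = CountVectorizer(analyzer='char_wb')
v.fit(["cat"])
print(v.get_feature_names_out())  # [' ' 'a' 'c' 't'], the characters plus the boundary space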
Frequent Characters#
Only characters that appear in at least 30% of documents:
for i in range(5):
    print(texteval(texts, target, CountVectorizer(analyzer='char_wb', min_df=0.3)))
Rare Characters#
Only characters that appear in at most 30% of documents:
for i in range(5):
    print(texteval(texts, target, CountVectorizer(analyzer='char_wb', max_df=0.3)))
Frequent Character Bigrams#
Character unigrams and bigrams that appear in at least 30% of documents:
vec = CountVectorizer(ngram_range=(1, 2), analyzer='char_wb', min_df=0.3)
for i in range(5):
    print(texteval(texts, target, vec))
Rare Character Bigrams#
Character unigrams and bigrams that appear in at most 30% of documents:
vec = CountVectorizer(ngram_range=(1, 2), analyzer='char_wb', max_df=0.3)
for i in range(5):
    print(texteval(texts, target, vec))