Text Classification with DraCor#
DraCor (the Drama Corpora Project) provides corpora of plays through an extensive API. In this notebook we test to what extent a play's author can be identified using only the texts they wrote. This is a typical application of stylometry.
Creating the Corpus#
The first two functions are used to download a corpus of plays from DraCor:
from urllib import request
import json
dracor_api = "https://dracor.org/api" # DraCor API-endpoint
def get_dracor(corpus, play=None):
    """Load either the corpus metadata or a single play's spoken text."""
    url = dracor_api + "/corpora/" + corpus           # base URL for the corpus
    if play is not None:                              # a specific play requested?
        url = url + "/play/" + play + "/spoken-text"  # URL for the play's text
    with request.urlopen(url) as req:                 # download data
        text = req.read().decode()                    # read and decode the response
    if play is None:                                  # only metadata requested?
        return json.loads(text)                       # parse and return JSON corpus metadata
    return text                                       # return the play's text
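As a quick check, a single play's text can be fetched directly. The identifier below is only an assumed example; valid identifiers appear in the name field of the corpus metadata:
# "goethe-iphigenie-auf-tauris" is a hypothetical example identifier;
# look up real ones in get_dracor("ger")["dramas"]
print(get_dracor("ger", "goethe-iphigenie-auf-tauris")[:200])  # first 200 characters of spoken text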
def get_data(corpus):
    """Download the texts and authors of all single-author plays in a corpus."""
    texts = []                                      # spoken texts of the plays
    target = []                                     # authors of the plays
    for drama in get_dracor(corpus)["dramas"]:      # iterate over all plays
        name = drama["name"]                        # play identifier
        authors = drama["authors"]                  # play's authors
        if len(authors) == 1:                       # keep only plays written by a single author
            texts.append(get_dracor(corpus, name))  # download text
            target.append(authors[0]["fullname"])   # record author
    return texts, target                            # return texts and matching authors
texts, target = get_data("ger") # download GerDraCor
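A quick sanity check of what was downloaded (exact counts depend on the current state of GerDraCor, so none are given here):
print(len(texts), "plays")           # number of single-author plays
print(len(set(target)), "authors")   # number of distinct authors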
Text Classification#
Most classification methods require numerical input, so the texts have to be transformed before we can work with them. The following function vectorizes the given texts using the supplied transformation class, then trains and evaluates a naive Bayes classifier for multinomial models. This classifier is typically well suited to text classification.
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
def texteval(X, Y, vec):
    """Vectorize the texts, then train and evaluate a classifier."""
    X = vec.fit_transform(X)                                   # transform text data
    # note: fitting the vectorizer on the full corpus leaks document frequencies
    # into the test split; acceptable for this comparison, but not rigorous
    train_X, test_X, train_Y, test_Y = train_test_split(X, Y)  # split into training and test data
    clf = MultinomialNB()                                      # instantiate classifier
    clf.fit(train_X, train_Y)                                  # train model
    return clf.score(test_X, test_Y)                           # evaluate model on held-out data
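Since train_test_split draws a fresh random split on every call, the scores below vary from run to run. As a sketch of an alternative, scikit-learn's cross_val_score would average out some of this variance (texteval_cv is a name introduced here, not part of the original setup):
from sklearn.model_selection import cross_val_score

def texteval_cv(X, Y, vec, folds=5):
    X = vec.fit_transform(X)                     # transform text data
    clf = MultinomialNB()                        # same classifier as above
    return cross_val_score(clf, X, Y, cv=folds)  # one accuracy score per fold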
Now we can study how different kinds of text transformation influence the quality of the classification.
Word Frequency#
Let's begin with the simplest option: every document is represented by a vector that records, for each word of the corpus vocabulary, how often it occurs in that document (a bag of words). The CountVectorizer does this for us:
from sklearn.feature_extraction.text import CountVectorizer
for i in range(5):  # five runs, each with a fresh random train/test split
    print(texteval(texts, target, CountVectorizer()))
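To illustrate what the vectorizer produces, here is a toy example (not part of the experiment):
toy = ["the cat sat", "the cat and the dog"]
v = CountVectorizer()
print(v.fit_transform(toy).toarray())  # one row per document, one column per vocabulary word
print(v.get_feature_names_out())       # the vocabulary (scikit-learn >= 1.0)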
Frequent Words#
Only words that appear in at least 30% of documents:
for i in range(5):
    print(texteval(texts, target, CountVectorizer(min_df=0.3)))
Rare Words#
Only words that appear in at most 30% of documents:
for i in range(5):
    print(texteval(texts, target, CountVectorizer(max_df=0.3)))
Frequent Bigrams#
Word unigrams and bigrams (ngram_range=(1, 2)) that appear in at least 30% of documents; the custom token_pattern also keeps one-letter tokens, which the default pattern drops:
vec = CountVectorizer(ngram_range=(1, 2), token_pattern=r'\b\w+\b', min_df=0.3)
for i in range(5):
    print(texteval(texts, target, vec))
Rare Bigrams#
Word unigrams and bigrams that appear in at most 30% of documents:
vec = CountVectorizer(ngram_range=(1, 2), token_pattern=r'\b\w+\b', max_df=0.3)
for i in range(5):
    print(texteval(texts, target, vec))
TF-IDF#
Words that occur in many documents are often not very informative for any single document. The term frequency is therefore commonly set in relation to the number of documents in which the word appears. A widely used measure for this is TF-IDF (term frequency times inverse document frequency):
from sklearn.feature_extraction.text import TfidfVectorizer
for i in range(5):
    print(texteval(texts, target, TfidfVectorizer(min_df=0.3)))
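As a rough sketch of what the default TfidfVectorizer computes: each raw term count is multiplied by a smoothed inverse document frequency, and every document vector is then scaled to unit length (this follows scikit-learn's documented defaults):
toy = ["the cat sat", "the cat and the dog"]
tv = TfidfVectorizer()       # defaults: smooth_idf=True, norm='l2'
X = tv.fit_transform(toy)
# smoothed idf in scikit-learn: idf(t) = ln((1 + n) / (1 + df(t))) + 1
print(tv.idf_)               # idf weight for each vocabulary word
print(X.toarray().round(3))  # tf-idf matrix, one L2-normalized row per document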
Character frequency#
We can repeat these experiments at the level of individual characters. To do this we simply pass a different analyzer to the CountVectorizer. The 'char_wb' analyzer builds character n-grams only from text inside word boundaries and pads the edges of words with spaces:
for i in range(5):
    print(texteval(texts, target, CountVectorizer(analyzer='char_wb')))
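A toy look at the features this analyzer produces (with the default ngram_range=(1, 1), i.e. single characters including the padding space):
v = CountVectorizer(analyzer='char_wb')
v.fit(["cat"])
print(v.get_feature_names_out())  # [' ' 'a' 'c' 't'], the characters plus the boundary space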
Frequent Characters#
Only characters that appear in at least 30% of documents:
for i in range(5):
    print(texteval(texts, target, CountVectorizer(analyzer='char_wb', min_df=0.3)))
Rare Characters#
Only characters that appear in at most 30% of documents:
for i in range(5):
    print(texteval(texts, target, CountVectorizer(analyzer='char_wb', max_df=0.3)))
Frequent Character Bigrams#
Character unigrams and bigrams that appear in at least 30% of documents:
vec = CountVectorizer(ngram_range=(1, 2), analyzer='char_wb', min_df=0.3)
for i in range(5):
    print(texteval(texts, target, vec))
Rare Character Bigrams#
Character unigrams and bigrams that appear in at most 30% of documents:
vec = CountVectorizer(ngram_range=(1, 2), analyzer='char_wb', max_df=0.3)
for i in range(5):
    print(texteval(texts, target, vec))