ftyers · steysie · Oct 30, 2018 · Oct 30, 2018 · Nov 1, 2018 · Nov 2, 2018
diff --git a/2018-komp-ling/practicals/Finite-state-transducers b/2018-komp-ling/practicals/Finite-state-transducers
@@ -0,0 +1 @@
+Here are my Practical #2 files (HFST, morphology)
diff --git a/2018-komp-ling/practicals/Morphological Disambiguation/CG3_results.txt b/2018-komp-ling/practicals/Morphological Disambiguation/CG3_results.txt
@@ -0,0 +1,51 @@
+"<Однако>"
+	"однако" ADV Degree=Pos
+"<стиль>"
+	"стиль" NOUN Animacy=Inan Case=Nom Gender=Masc Number=Sing
+;	"стиль" NOUN Animacy=Inan Case=Acc Gender=Masc Number=Sing REMOVE:18
+"<работы>"
+	"работа" NOUN Animacy=Inan Case=Gen Gender=Fem Number=Sing
+;	"работа" NOUN Animacy=Inan Case=Nom Gender=Fem Number=Plur REMOVE:19
+;	"работа" NOUN Animacy=Inan Case=Acc Gender=Fem Number=Plur REMOVE:20
+"<Семена>"
+	"Семен" PROPN Animacy=Anim Case=Gen Gender=Masc Number=Sing
+;	"Семен" PROPN Animacy=Anim Case=Acc Gender=Masc Number=Sing REMOVE:21
+"<Еремеевича>"
+	"Еремеевич" PROPN Animacy=Anim Case=Gen Gender=Masc Number=Sing
+"<заключался>"
+	"заключаться" VERB Aspect=Imp Gender=Masc Mood=Ind Number=Sing Tense=Past VerbForm=Fin Voice=Mid
+"<в>"
+	"в" ADP
+"<том>"
+	"то" PRON Animacy=Inan Case=Loc Gender=Neut Number=Sing
+;	"тот" DET Case=Loc Gender=Neut Number=Sing REMOVE:14
+;	"тот" DET Case=Loc Gender=Masc Number=Sing REMOVE:14
+"<,>"
+	"," PUNCT
+"<чтобы>"
+	"чтобы" SCONJ Mood=Cnd
+"<принимать>"
+	"принимать" VERB Aspect=Imp VerbForm=Inf Voice=Act
+"<всех>"
+	"все" PRON Animacy=Anim Case=Gen Number=Plur
+;	"весь" DET Case=Gen Number=Plur REMOVE:16
+;	"весь" DET Case=Loc Number=Plur REMOVE:16
+;	"весь" DET Case=Acc Number=Plur REMOVE:16
+;	"все" PRON Animacy=Anim Case=Acc Number=Plur REMOVE:22
+"<желающих>"
+	"желать" VERB Aspect=Imp Case=Gen Number=Plur Tense=Pres VerbForm=Part Voice=Act
+"<и>"
+	"и" CCONJ
+;	"и" PART REMOVE:15
+"<лично>"
+	"лично" ADV Degree=Pos
+"<вникать>"
+	"*вникать"
+"<в>"
+	"в" ADP
+"<дело>"
+	"дело" NOUN Animacy=Inan Case=Acc Gender=Neut Number=Sing
+;	"дело" NOUN Animacy=Inan Case=Nom Gender=Neut Number=Sing REMOVE:17
+"<.>"
+	"." PUNCT
+
diff --git a/2018-komp-ling/practicals/Morphological Disambiguation/UDPipe_finnish_results.txt b/2018-komp-ling/practicals/Morphological Disambiguation/UDPipe_finnish_results.txt
@@ -0,0 +1,12 @@
+Metrics    | Precision |    Recall |  F1 Score | AligndAcc
+-----------+-----------+-----------+-----------+-----------
+Tokens     |    100.00 |    100.00 |    100.00 |
+Sentences  |    100.00 |    100.00 |    100.00 |
+Words      |    100.00 |    100.00 |    100.00 |
+UPOS       |     94.64 |     94.64 |     94.64 |     94.64
+XPOS       |     95.81 |     95.81 |     95.81 |     95.81
+Feats      |     90.77 |     90.77 |     90.77 |     90.77
+AllTags    |     89.75 |     89.75 |     89.75 |     89.75
+Lemmas     |     84.52 |     84.52 |     84.52 |     84.52
+UAS        |    100.00 |    100.00 |    100.00 |    100.00
+LAS        |    100.00 |    100.00 |    100.00 |    100.00
diff --git a/2018-komp-ling/practicals/Morphological Disambiguation/UDPipe_polish.txt b/2018-komp-ling/practicals/Morphological Disambiguation/UDPipe_polish.txt
@@ -0,0 +1,12 @@
+Metrics    | Precision |    Recall |  F1 Score | AligndAcc
+-----------+-----------+-----------+-----------+-----------
+Tokens     |    100.00 |    100.00 |    100.00 |
+Sentences  |    100.00 |    100.00 |    100.00 |
+Words      |    100.00 |    100.00 |    100.00 |
+UPOS       |    100.00 |    100.00 |    100.00 |    100.00
+XPOS       |     99.99 |     99.99 |     99.99 |     99.99
+Feats      |     99.99 |     99.99 |     99.99 |     99.99
+AllTags    |     99.99 |     99.99 |     99.99 |     99.99
+Lemmas     |     99.99 |     99.99 |     99.99 |     99.99
+UAS        |    100.00 |    100.00 |    100.00 |    100.00
+LAS        |    100.00 |    100.00 |    100.00 |    100.00
diff --git a/2018-komp-ling/practicals/Morphological Disambiguation/get_features_improved_suffixes b/2018-komp-ling/practicals/Morphological Disambiguation/get_features_improved_suffixes
@@ -0,0 +1,28 @@
+def _get_features(self, i, word, context, prev, prev2):
+		'''Map tokens into a feature representation, implemented as a
+		{hashable: float} dict. If the features change, a new model must be
+		trained.
+		'''
+		def add(name, *args):
+			features[' '.join((name,) + tuple(args))] += 1  # key = name and args joined by single spaces; bump its count
+
+		i += len(self.START)  # shift i by len(self.START); presumably context is padded with START tokens - confirm against caller
+		features = defaultdict(int)  # unseen keys start at 0, so add() can increment without initialising
+		# It's useful to have a constant feature, which acts sort of like a prior
+		add('bias')
+		add('i suffix', word[-2:])  # last two characters of the current word
+		add('i pref1', word[0])  # first character; NOTE(review): raises IndexError if word is an empty string
+		add('i-1 tag', prev)
+		add('i-2 tag', prev2)
+		add('i tag+i-2 tag', prev, prev2)  # NOTE(review): label says 'i tag' but the values are prev and prev2 (used as the i-1 / i-2 tags above); renaming it would change the feature keys
+		add('i word', context[i])  # context[i] is read after the shift applied to i above
+		add('i-1 tag+i word', prev, context[i])
+		add('i-1 word', context[i-1])
+		add('i-1 suffix', context[i-1][-2:])  # last two characters of the previous context entry
+		add('i-2 word', context[i-2])
+		add('i+1 word', context[i+1])  # assumes context has at least two entries after position i - TODO confirm
+		add('i+1 suffix', context[i+1][-2:])  # last two characters of the next context entry
+		add('i+2 word', context[i+2])
+		#print(word, '|||', features)
+		return features  # defaultdict mapping feature string -> count
+
+
diff --git a/2018-komp-ling/practicals/Morphological Disambiguation/perceptron_results.txt b/2018-komp-ling/practicals/Morphological Disambiguation/perceptron_results.txt
@@ -0,0 +1,12 @@
+Metrics    | Precision |    Recall |  F1 Score | AligndAcc
+-----------+-----------+-----------+-----------+-----------
+Tokens     |    100.00 |    100.00 |    100.00 |
+Sentences  |    100.00 |    100.00 |    100.00 |
+Words      |    100.00 |    100.00 |    100.00 |
+UPOS       |     95.89 |     95.89 |     95.89 |     95.89
+XPOS       |    100.00 |    100.00 |    100.00 |    100.00
+Feats      |    100.00 |    100.00 |    100.00 |    100.00
+AllTags    |     95.89 |     95.89 |     95.89 |     95.89
+Lemmas     |    100.00 |    100.00 |    100.00 |    100.00
+UAS        |    100.00 |    100.00 |    100.00 |    100.00
+LAS        |    100.00 |    100.00 |    100.00 |    100.00
diff --git a/2018-komp-ling/practicals/Morphological Disambiguation/report.md b/2018-komp-ling/practicals/Morphological Disambiguation/report.md
@@ -0,0 +1,26 @@
+# Practical 3 - Morphological disambiguation
+#### *Anastasia Nikiforova*
+
+1. **Tagger Comparison**
+
+For tagging, I used a Polish corpus. The taggers I compared were UDPipe and MarMoT.
+
+UDPipe results are in ```UDPipe_finnish_results.txt``` and ```UDPipe_polish.txt```, MarMoT results are in ```text.out.txt```.
+MarMoT was quite tricky to use, and the resulting tags are rather strange. For tagging I used the pre-trained model ```pl.marmot```.
+
+I hope it is the correct tagging.
+
+2. **Constraint Grammar**
+
+CG3 was quite difficult to understand at first, but http://visl.sdu.dk/~eckhard/powerpoint/CG3_Nodalida_dis.pdf made it a lot easier. 
+After reading this tutorial, writing rules was intuitive.
+
+Basically, I eliminated all incorrect tags of nouns (they follow each other, so I had to modify the rule Nominative-Accusative-Accusative into a constraint). Also, there is a constraint that doesn't allow a noun to be nominative when it follows a preposition.
+
+I actually enjoyed this task! It is very formal and logical, which is great.
+
+3. **Improving Perceptron Tagger**
+
+The results of the improved perceptron are in ```perceptron_results.txt```. There is also an excerpt from the original code, showing the changes to the ```_get_features``` function.
+
+The best performance was achieved when I changed the suffix-related parameters from [-3:] to [-2:]. Changing the word-related parameters decreased the performance, even when combined with the changed suffix-related parameters.
diff --git a/2018-komp-ling/practicals/Morphological Disambiguation/rus.cg3 b/2018-komp-ling/practicals/Morphological Disambiguation/rus.cg3
@@ -0,0 +1,26 @@
+DELIMITERS = "." ;  # "." closes a sentence window
+
+LIST DET = DET ;  # each LIST below defines a one-tag set named after its part-of-speech tag
+LIST PUNCT = PUNCT ;
+LIST NOUN = NOUN ;
+LIST VERB = VERB ;
+LIST PRON = PRON ;  # NOTE(review): this set name is not referenced by the rules below; the PRON rule uses an inline tag list
+LIST ADV = ADV ;
+LIST ADP = ADP ;
+LIST PART = PART ;
+
+SECTION  # CG3_results.txt traces rules by grammar line number (REMOVE:14 .. REMOVE:22), so keep each rule on its current line
+
+REMOVE DET IF (1C PUNCT) ;  # drop DET readings when the next cohort is unambiguously PUNCT
+REMOVE PART IF (1C ADV) ;  # drop PART readings when the next cohort is unambiguously ADV
+REMOVE DET IF (-1C VERB) ;  # drop DET readings when the previous cohort is unambiguously VERB
+REMOVE (NOUN Case=Nom) IF (-1C ADP) ;  # no nominative noun directly after an unambiguous ADP
+REMOVE (NOUN Case=Acc) IF (1C NOUN) ;  # no accusative noun directly before an unambiguous NOUN
+REMOVE (NOUN Case=Nom) IF (-1C NOUN) ;  # no nominative noun directly after an unambiguous NOUN
+REMOVE (NOUN Case=Acc) IF (-1C NOUN) ;  # no accusative noun directly after an unambiguous NOUN
+REMOVE (PROPN Case=Acc) IF (-1C NOUN) ;  # no accusative proper noun directly after an unambiguous NOUN
+REMOVE (PRON Case=Acc) IF (1C (VERB Case=Gen)) ;  # no accusative pronoun directly before a cohort whose readings are all VERB with Case=Gen
+
+
+
+
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		Here are my Practical #2 files (HFST, morphology)