584 lines
17 KiB
C
584 lines
17 KiB
C
/* Emoticon message cleanser:
 *
 * Skeleton code written by Farhana Choudhury and Jianzhong Qi, April 2020
 *
 * Authorship Declaration:
 * (1) I certify that the program contained in this submission is completely
 * my own individual work, except where explicitly noted by comments that
 * provide details otherwise. I understand that work that has been developed
 * by another student, or by me in collaboration with other students,
 * or by non-students as a result of request, solicitation, or payment,
 * may not be submitted for assessment in this subject. I understand that
 * submitting for assessment work developed by or in collaboration with
 * other students or non-students constitutes Academic Misconduct, and
 * may be penalized by mark deductions, or by other penalties determined
 * via the University of Melbourne Academic Honesty Policy, as described
 * at https://academicintegrity.unimelb.edu.au.
 *
 * (2) I also certify that I have not provided a copy of this work in either
 * softcopy or hardcopy or any other form to any other student, and nor will
 * I do so until after the marks are released. I understand that providing
 * my work to other students, regardless of my intention or any undertakings
 * made to me by that other student, is also Academic Misconduct.
 *
 * (3) I further understand that providing a copy of the assignment
 * specification to any form of code authoring or assignment tutoring
 * service, or drawing the attention of others to such services and code
 * that may have been made available via such a service, may be regarded
 * as Student General Misconduct (interfering with the teaching activities
 * of the University and/or inciting others to commit Academic Misconduct).
 * I understand that an allegation of Student General Misconduct may arise
 * regardless of whether or not I personally make use of such solutions
 * or sought benefit from such actions.
 *
 * Signed by: Rory Healy 964275
 * Dated: 9th April 2020
 */
|
|
|
|
#include <stdio.h>
|
|
#include <stdlib.h>
|
|
#include <ctype.h>
|
|
#include <string.h>
|
|
|
|
/* stage numbers, passed to print_stage_header() */
#define STAGE_NUM_ONE 1
#define STAGE_NUM_TWO 2
#define STAGE_NUM_THREE 3
#define STAGE_NUM_FOUR 4
#define STAGE_NUM_FIVE 5

/* printf format string for a stage header; takes the stage number */
#define STAGE_HEADER "Stage %d\n==========\n"

/* input size limits */
#define MAX_MSG_LENGTH 280      /* maximum message length */
#define MAX_NUM_MSGS 100        /* maximum number of messages */
#define MAX_EMTCN_LENGTH 50     /* maximum emoticon (dictionary) line length */
#define MAX_NUM_EMTCN 50        /* maximum number of emoticons */

/* fixed-size string buffers; the extra byte holds the '\0' terminator */
typedef char msg_t[MAX_MSG_LENGTH+1];       /* a message */
typedef char emtcn_t[MAX_EMTCN_LENGTH+1];   /* an emoticon */
|
|
|
|
/****************************************************************/
|
|
|
|
/* function prototypes */
|
|
void read_one_msg(msg_t one_msg, int max_len);
|
|
void print_stage_header(int stage_num);
|
|
int count_tokens(msg_t one_msg);
|
|
|
|
void stage_one(msg_t one_msg);
|
|
void stage_two(msg_t msgs[], int *num_msgs);
|
|
void stage_three(msg_t msgs[], int num_msgs);
|
|
void stage_four(emtcn_t emtcns[], int *num_emtcns);
|
|
void stage_five(msg_t msgs[], int num_msgs, emtcn_t emtcns[], int num_emtcns);
|
|
|
|
/* add your own function prototypes here */
|
|
int is_seperating_line(msg_t one_msg);
|
|
|
|
void remove_leading_commas(msg_t one_msg, int msg_len);
|
|
void remove_trailing_commas(msg_t one_msg, int msg_len);
|
|
void remove_consecutive_commas(msg_t one_msg, int msg_len);
|
|
int length_consecutive_commas(msg_t one_msg, int msg_len);
|
|
|
|
void get_next_emtcn(msg_t one_msg, emtcn_t emtcn_msg, \
|
|
int *emtcn_len, int *msg_offset);
|
|
void remove_curr_emtcn(msg_t one_msg, int emtcn_len, int offset, int *msg_len);
|
|
int get_emtcn_len(msg_t one_msg, int msg_len, int msg_offset);
|
|
int is_emtcns_same(emtcn_t emtcn1, emtcn_t emtcn2, int emtcn_len);
|
|
int is_in_dict(emtcn_t curr_emtcn, int curr_emtcn_len, \
|
|
emtcn_t emtcns[], int num_emtcns);
|
|
|
|
/****************************************************************/
|
|
|
|
/* main function controls all the action, do NOT modify this function */
|
|
int
|
|
main(int argc, char *argv[]) {
|
|
/* to hold all input messages */
|
|
msg_t msgs[MAX_NUM_MSGS];
|
|
/* to hold the number of input messages */
|
|
int num_msgs = 0;
|
|
/* to hold all input emoticons in the dictionary */
|
|
emtcn_t emtcns[MAX_NUM_EMTCN];
|
|
/* to hold the number of emoticons in the dictionary */
|
|
int num_emtcns = 0;
|
|
|
|
/* stage 1: reading the first message */
|
|
stage_one(msgs[num_msgs]);
|
|
num_msgs++;
|
|
|
|
/* stage 2: removing alphanumeric characters */
|
|
stage_two(msgs, &num_msgs);
|
|
|
|
/* stage 3: removing extra commas */
|
|
stage_three(msgs, num_msgs);
|
|
|
|
/* stage 4: reading the dictionary and finding the longest emoticon */
|
|
stage_four(emtcns, &num_emtcns);
|
|
|
|
/* stage 5: removing invalid emoticons with the help of the dictionary */
|
|
stage_five(msgs, num_msgs, emtcns, num_emtcns);
|
|
|
|
/* all done; take some rest */
|
|
return 0;
|
|
}
|
|
|
|
/* read a line of input into one_msg */
|
|
void
|
|
read_one_msg(msg_t one_msg, int max_len) {
|
|
int i = 0, c;
|
|
while (((c = getchar()) != EOF) && (c != '\n') && (c != '\r')) {
|
|
if (i < max_len) {
|
|
one_msg[i++] = c;
|
|
} else {
|
|
printf("Invalid input line, toooooooo long.\n");
|
|
exit(EXIT_FAILURE);
|
|
}
|
|
}
|
|
one_msg[i] = '\0';
|
|
}
|
|
|
|
/* print stage header given stage number */
|
|
void
|
|
print_stage_header(int stage_num) {
|
|
printf(STAGE_HEADER, stage_num);
|
|
}
|
|
|
|
/****************************************************************/
|
|
/* add your code below */
|
|
|
|
/* removes an emtcn from a msg given a length and offset */
|
|
void
|
|
remove_curr_emtcn(msg_t one_msg, int emtcn_len, int offset, int *msg_len) {
|
|
msg_t msg_before, msg_after;
|
|
int i, j;
|
|
|
|
/* the message before the emtcn not in dict */
|
|
for (i = 0; i < offset; i++) {
|
|
msg_before[i] = one_msg[i];
|
|
}
|
|
msg_before[i] = '\0';
|
|
|
|
/* the message after the emtcn not in dict */
|
|
for (j = offset + emtcn_len; one_msg[j] != '\0'; j++) {
|
|
msg_after[j - offset - emtcn_len] = one_msg[j];
|
|
}
|
|
msg_after[j - offset - emtcn_len] = '\0';
|
|
|
|
/* replacing one_msg with msg_before and msg_after */
|
|
one_msg[0] = 0;
|
|
for (i = 0; i < *msg_len - emtcn_len; i++) {
|
|
if (i < offset) {
|
|
one_msg[i] = msg_before[i];
|
|
} else {
|
|
one_msg[i] = msg_after[i - offset];
|
|
}
|
|
}
|
|
one_msg[i] = '\0';
|
|
|
|
// here is where a bug occurs in test 1
|
|
*msg_len -= emtcn_len;
|
|
}
|
|
|
|
/* finds the next emoticon in one_msg and copies it to emtcn_msg */
|
|
void
|
|
get_next_emtcn(msg_t one_msg, emtcn_t emtcn_msg, \
|
|
int *emtcn_len, int *msg_offset) {
|
|
int i;
|
|
for (i = *msg_offset; one_msg[i] != '\0'; i++) {
|
|
if (one_msg[i] == ',') {
|
|
break;
|
|
} else {
|
|
emtcn_msg[i - *msg_offset] = one_msg[i];
|
|
}
|
|
}
|
|
*emtcn_len = i - *msg_offset;
|
|
emtcn_msg[*emtcn_len] = '\0';
|
|
*msg_offset += *emtcn_len + 1; // add 1 to account for the comma
|
|
}
|
|
|
|
/* tests if two emtcns are identical */
|
|
int
|
|
is_emtcns_same(emtcn_t emtcn1, emtcn_t emtcn2, int emtcn_len) {
|
|
for (int i = 0; i < emtcn_len; i++) {
|
|
if (emtcn1[i] != emtcn2[i]) {
|
|
return 0;
|
|
}
|
|
}
|
|
return 1;
|
|
}
|
|
|
|
/* tests if an emoticon is in the dictionary emtcns[] */
|
|
int
|
|
is_in_dict(emtcn_t curr_emtcn, int curr_emtcn_len, \
|
|
emtcn_t emtcns[], int num_emtcns) {
|
|
/* the current emtcn from emtcns[] being compared */
|
|
emtcn_t emtcn_from_dict;
|
|
|
|
int i, j;
|
|
for (i = 0; i < num_emtcns; i++) {
|
|
/* extracts the emoticon from the dictionary */
|
|
for (j = 0; emtcns[i][j] != ','; j++) {
|
|
emtcn_from_dict[j] = emtcns[i][j];
|
|
}
|
|
emtcn_from_dict[j] = '\0';
|
|
|
|
/* compares lengths first to avoid unnecessary function calls */
|
|
if (curr_emtcn_len == j) {
|
|
if (is_emtcns_same(emtcn_from_dict, curr_emtcn, curr_emtcn_len)) {
|
|
return 1;
|
|
}
|
|
}
|
|
}
|
|
return 0;
|
|
}
|
|
|
|
/* removes leading commas from a message */
|
|
void
|
|
remove_leading_commas(msg_t one_msg, int msg_len) {
|
|
/* counts the number of leading commas */
|
|
int i = 0;
|
|
while (one_msg[i] == ',') {
|
|
i += 1;
|
|
}
|
|
|
|
/* counts the number of characters copied into new_msg */
|
|
int j;
|
|
|
|
/* the message without leading commas */
|
|
msg_t new_msg;
|
|
for (j = 0; j < msg_len; j++) {
|
|
if (one_msg[j + i] == '\0') {
|
|
break;
|
|
} else {
|
|
new_msg[j] = one_msg[j + i];
|
|
}
|
|
}
|
|
new_msg[j] = '\0';
|
|
|
|
/* clears one_msg, and replaces with new_msg */
|
|
one_msg[0] = 0;
|
|
strncpy(one_msg, new_msg, j);
|
|
one_msg[j] = '\0';
|
|
}
|
|
|
|
/* removes trailing commas from a message */
|
|
void
|
|
remove_trailing_commas(msg_t one_msg, int msg_len) {
|
|
/* counts the number of trailing commas */
|
|
int i = 0;
|
|
while (one_msg[msg_len - i - 1] == ',') {
|
|
i += 1;
|
|
}
|
|
|
|
/* counts the number of characters copied into new_msg */
|
|
int j;
|
|
|
|
/* the message without trailing commas */
|
|
msg_t new_msg;
|
|
for (j = 0; j < msg_len - i; j++) {
|
|
new_msg[j] = one_msg[j];
|
|
}
|
|
new_msg[j] = '\0';
|
|
|
|
/* clears one_msg, and replaces with new_msg */
|
|
one_msg[0] = 0;
|
|
strncpy(one_msg, new_msg, j);
|
|
one_msg[j] = '\0';
|
|
}
|
|
|
|
/* removes consecutive commas from within a message */
|
|
void remove_consecutive_commas(msg_t one_msg, int msg_len) {
|
|
one_msg[msg_len] = '\0'; // safety net
|
|
|
|
int max_commas_length = length_consecutive_commas(one_msg, msg_len);
|
|
|
|
while (max_commas_length > 1) {
|
|
int i = 0, j = 0;
|
|
msg_t new_msg, msg_cat;
|
|
|
|
/* find out where the consecutive commas start */
|
|
for (i = 0; i < msg_len; i++) {
|
|
if ((one_msg[i] == ',') && (one_msg[i + 1] == ',')) {
|
|
break;
|
|
}
|
|
}
|
|
/* add all characters including one comma to new_msg */
|
|
strncpy(new_msg, one_msg, i);
|
|
new_msg[i] = '\0';
|
|
|
|
/* find out how long the rest of the message is */
|
|
for (j = 0; j < msg_len; j++) {
|
|
if (one_msg[j + i + max_commas_length - 1] == '\0') {
|
|
break;
|
|
} else {
|
|
msg_cat[j] = one_msg[j + i + max_commas_length - 1];
|
|
}
|
|
}
|
|
msg_cat[j] = '\0';
|
|
|
|
/* and add the rest of the message to new_msg (exluding the extra
|
|
* commas
|
|
*/
|
|
|
|
strncat(new_msg, msg_cat, j);
|
|
new_msg[i + j] = '\0';
|
|
|
|
/* clear one_msg and copy the edited message to it */
|
|
one_msg[0] = 0;
|
|
strncpy(one_msg, new_msg, i + j);
|
|
one_msg[i + j] = '\0';
|
|
msg_len = i + j;
|
|
|
|
/* Loop guard */
|
|
max_commas_length = length_consecutive_commas(one_msg, msg_len);
|
|
}
|
|
|
|
}
|
|
|
|
/* finds the length of the first non-one length of commas */
|
|
int length_consecutive_commas(msg_t one_msg, int msg_len) {
|
|
int i, counter = 0;
|
|
for (i = 0; i < msg_len; i++) {
|
|
if (one_msg[i] == ',') {
|
|
if (one_msg[i + 1] == ',') {
|
|
counter += 1;
|
|
continue;
|
|
} else {
|
|
if (counter) {
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
return counter + 1;
|
|
}
|
|
|
|
/* checks if the current message is a seperating line */
|
|
int
|
|
is_seperating_line(msg_t one_msg) {
|
|
msg_t seperating_line = "###";
|
|
if (!strncmp(one_msg, seperating_line, strlen(seperating_line))) {
|
|
return 1;
|
|
}
|
|
return 0;
|
|
}
|
|
|
|
/* scan a message and count the number of tokens in it */
|
|
int
|
|
count_tokens(msg_t one_msg) {
|
|
/* this counts the number of commas, which is analogous
|
|
to the number of tokens */
|
|
int num_tokens = 1;
|
|
|
|
for (int i = 0; i < (int)strlen(one_msg); i++) {
|
|
if (one_msg[i] == ',') {
|
|
num_tokens += 1;
|
|
}
|
|
}
|
|
|
|
return num_tokens;
|
|
}
|
|
|
|
/****************************************************************/
|
|
/* stages 1 - 5 */
|
|
|
|
/* stage 1: reading the first message */
|
|
void
|
|
stage_one(msg_t one_msg) {
|
|
/* print stage header */
|
|
print_stage_header(STAGE_NUM_ONE);
|
|
|
|
/* read the first message */
|
|
read_one_msg(one_msg, MAX_MSG_LENGTH);
|
|
|
|
/* count and print the number of tokens */
|
|
printf("Number of tokens: %d\n\n", count_tokens(one_msg));
|
|
}
|
|
|
|
/* stage 2: removing alphanumeric characters */
|
|
void
|
|
stage_two(msg_t msgs[], int *num_msgs) {
|
|
print_stage_header(STAGE_NUM_TWO);
|
|
|
|
/* *num_msgs will be used as an index, hence why it is zeroed */
|
|
*num_msgs = 0;
|
|
|
|
/* the current message being analysed */
|
|
msg_t curr_msg;
|
|
|
|
/* as msgs[0] already has a message in it, copy that to curr_msg
|
|
* and use that in the loop first, then read more messages
|
|
*/
|
|
strncpy(curr_msg, msgs[0], strlen(msgs[0]));
|
|
curr_msg[strlen(msgs[0])] = '\0';
|
|
|
|
do {
|
|
if (is_seperating_line(curr_msg)) {
|
|
break;
|
|
}
|
|
|
|
/* the message without alphanumeric characters */
|
|
msg_t new_msg;
|
|
|
|
/* the current character length of new_msg */
|
|
int len = 0;
|
|
|
|
for (int j = 0; j <= (int)strlen(curr_msg); j++) {
|
|
if (!isalnum(curr_msg[j])) {
|
|
new_msg[len++] = curr_msg[j];
|
|
}
|
|
}
|
|
new_msg[len] = '\0';
|
|
|
|
/* Copy the edited message to msgs[] */
|
|
strncpy(msgs[*num_msgs], new_msg, len);
|
|
msgs[*num_msgs][len] = '\0';
|
|
|
|
/* prepare for the next message to be read */
|
|
read_one_msg(curr_msg, MAX_MSG_LENGTH);
|
|
*num_msgs += 1;
|
|
} while (*num_msgs <= MAX_NUM_MSGS);
|
|
|
|
for (int i = 0; i < *num_msgs; i++) {
|
|
printf("%s\n", msgs[i]);
|
|
}
|
|
printf("\n");
|
|
}
|
|
|
|
/* stage 3: removing extra commas */
|
|
void
|
|
stage_three(msg_t msgs[], int num_msgs) {
|
|
print_stage_header(STAGE_NUM_THREE);
|
|
int i, curr_msg_len;
|
|
msg_t curr_msg;
|
|
|
|
for (i = 0; i < num_msgs; i++) {
|
|
/* copy the current message stored in msgs[i] to curr_msg */
|
|
curr_msg_len = strlen(msgs[i]);
|
|
strncpy(curr_msg, msgs[i], curr_msg_len);
|
|
curr_msg[curr_msg_len] = '\0';
|
|
|
|
/* remove extra commas from curr_msg */
|
|
remove_leading_commas(curr_msg, curr_msg_len);
|
|
curr_msg_len = strlen(curr_msg);
|
|
remove_trailing_commas(curr_msg, curr_msg_len);
|
|
curr_msg_len = strlen(curr_msg);
|
|
remove_consecutive_commas(curr_msg, curr_msg_len);
|
|
curr_msg_len = strlen(curr_msg);
|
|
|
|
/* Clear the current message stored in msgs[i] and replace
|
|
* with the edited message stored in curr_msg
|
|
*/
|
|
msgs[i][0] = 0;
|
|
strncpy(msgs[i], curr_msg, strlen(curr_msg));
|
|
msgs[i][curr_msg_len] = '\0';
|
|
}
|
|
|
|
for (int j = 0; j < num_msgs; j++) {
|
|
printf("%s\n", msgs[j]);
|
|
}
|
|
printf("\n");
|
|
}
|
|
|
|
/* stage 4: reading the dictionary and finding the longest emoticon */
|
|
void
|
|
stage_four(emtcn_t emtcns[], int *num_emtcns) {
|
|
print_stage_header(STAGE_NUM_FOUR);
|
|
|
|
/* the emoticon to be read */
|
|
emtcn_t new_emtcn;
|
|
|
|
while (*num_emtcns <= MAX_NUM_EMTCN) {
|
|
read_one_msg(new_emtcn, MAX_EMTCN_LENGTH);
|
|
/* break as soon as the line is empty */
|
|
if ((int)strlen(new_emtcn) == 0) {
|
|
break;
|
|
}
|
|
strncpy(emtcns[*num_emtcns], new_emtcn, strlen(new_emtcn));
|
|
emtcns[*num_emtcns][(int)strlen(new_emtcn)] = '\0';
|
|
*num_emtcns += 1;
|
|
}
|
|
emtcn_t max_emtcn;
|
|
int max_emtcn_len = 0, curr_len, i, j;
|
|
for (i = 0; i < *num_emtcns; i++) {
|
|
/* get the length of emoticon, and compare it to the max length */
|
|
for (j = 0; j < (int)strlen(emtcns[i]); j++) {
|
|
if (emtcns[i][j] == ',') {
|
|
break;
|
|
}
|
|
}
|
|
curr_len = j;
|
|
|
|
if (curr_len > max_emtcn_len) {
|
|
max_emtcn_len = curr_len;
|
|
strncpy(max_emtcn, emtcns[i], j);
|
|
max_emtcn[j] = '\0';
|
|
}
|
|
}
|
|
printf("Emoticon total: %d\nLongest: %s\nLength: %d\n",
|
|
*num_emtcns, max_emtcn, max_emtcn_len);
|
|
printf("\n");
|
|
}
|
|
|
|
/* stage 5: removing invalid emoticons with the help of the dictionary */
|
|
void
|
|
stage_five(msg_t msgs[], int num_msgs, emtcn_t emtcns[], int num_emtcns) {
|
|
print_stage_header(STAGE_NUM_FIVE);
|
|
int i, j;
|
|
|
|
/* the length of current message from msgs[] */
|
|
int curr_msg_len = 0;
|
|
|
|
/* the current emtcn from msgs[] */
|
|
emtcn_t curr_emtcn;
|
|
int curr_emtcn_len = 0;
|
|
|
|
/* the index of the curr_emtcn from msgs[] */
|
|
int curr_offset = 0;
|
|
|
|
for (i = 0; i < num_msgs; i++) {
|
|
for (j = 0; msgs[i][j] != '\0'; j++) {
|
|
/* do nothing, count characters before \0 */
|
|
}
|
|
curr_msg_len = j;
|
|
|
|
while (curr_offset < curr_msg_len) {
|
|
/* get the next emoticon, check if it's in emtcns, and remove it
|
|
* from msgs[i] if its not in emtcns
|
|
*/
|
|
get_next_emtcn(msgs[i], curr_emtcn, &curr_emtcn_len, &curr_offset);
|
|
|
|
if (!is_in_dict(curr_emtcn, curr_emtcn_len, emtcns, num_emtcns)) {
|
|
/* this is much easier to do with strncat, but can't be used
|
|
* here due to assignment restrictions
|
|
*/
|
|
remove_curr_emtcn(msgs[i], curr_emtcn_len, \
|
|
(curr_offset - curr_emtcn_len - 1), \
|
|
&curr_msg_len);
|
|
}
|
|
}
|
|
curr_msg_len = 0;
|
|
curr_offset = 0;
|
|
}
|
|
|
|
for (i = 0; i < num_msgs; i++) {
|
|
if (msgs[i][0] == '\0') {
|
|
continue;
|
|
}
|
|
printf("%s\n", msgs[i]);
|
|
}
|
|
}
|
|
|
|
/*
|
|
* _ _ _ _
|
|
* /\ | | (_) | | | |
|
|
* / \ | | __ _ ___ _ __ _ | |_ | |__ _ __ ___ ___
|
|
* / /\ \ | | / _` | / _ \ | '__| | | | __| | '_ \ | '_ ` _ \ / __|
|
|
* / ____ \ | | | (_| | | (_) | | | | | | |_ | | | | | | | | | | \__ \
|
|
* /_/ \_\ |_| \__, | \___/ |_| |_| \__| |_| |_| |_| |_| |_| |___/
|
|
* __/ |
|
|
* |___/
|
|
* ______ _
|
|
* /\ | ____| | |
|
|
* / \ _ __ ___ | |__ _ _ _ __ | |
|
|
* / /\ \ | '__| / _ \ | __| | | | | | '_ \ | |
|
|
* / ____ \ | | | __/ | | | |_| | | | | | |_|
|
|
* /_/ \_\ |_| \___| |_| \__,_| |_| |_| (_)
|
|
*
|
|
*/
|