<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article
  PUBLIC "-//NLM//DTD JATS (Z39.96) Journal Publishing DTD with MathML3 v1.2 20190208//EN" "JATS-journalpublishing1-mathml3.dtd">
<article xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:ali="http://www.niso.org/schemas/ali/1.0/" article-type="research-article" dtd-version="1.2" xml:lang="en">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">JBDGM</journal-id>
<journal-id journal-id-type="nlm-ta">Jahrb Musikpsychol</journal-id>
<journal-title-group>
<journal-title>Jahrbuch Musikpsychologie</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Jahrb. Musikpsychol.</abbrev-journal-title>
</journal-title-group>
<issn pub-type="epub">2569-5665</issn>
<publisher><publisher-name>PsychOpen</publisher-name></publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="publisher-id">jbdgm.221</article-id>
<article-id pub-id-type="doi">10.5964/jbdgm.221</article-id>
<article-categories>
<subj-group subj-group-type="heading"><subject>Research Reports</subject></subj-group>

<subj-group subj-group-type="badge">
<subject>Data</subject>
<subject>Code</subject>
<subject>Materials</subject>
	<subject>Preregistration</subject>
</subj-group>
</article-categories>
<title-group>
<article-title>The Creative Musical Achievement of AI Systems Compared to Music Students: A Replication of the Study by Schreiber et al. (2024)</article-title>
<trans-title-group xml:lang="de">
<trans-title>Die kreativen musikalischen Leistungen von KI-Systemen im Vergleich zu Musikstudierenden: Eine Replikation der Studie von Schreiber et al. (2024)</trans-title>
</trans-title-group>
<alt-title alt-title-type="right-running">The Creative Musical Achievement of AI Systems</alt-title>
<alt-title specific-use="APA-reference-style" xml:lang="en">The creative musical achievement of AI systems compared to music students: A replication of the study by Schreiber et al. (2024)</alt-title>
</title-group>
<contrib-group>
	
	<contrib contrib-type="author"><contrib-id contrib-id-type="orcid" authenticated="false">https://orcid.org/0009-0009-4064-7419</contrib-id><name name-style="western"><surname>Meier</surname><given-names>Nicholas</given-names></name><xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
	<role vocab="credit" vocab-identifier="http://credit.niso.org/"
		vocab-term="Conceptualization"
		vocab-term-identifier="http://credit.niso.org/contributor-roles/conceptualization/"
		>Conceptualization</role>
	<role vocab="credit" vocab-identifier="http://credit.niso.org/"
		vocab-term="Methodology"
		vocab-term-identifier="http://credit.niso.org/contributor-roles/methodology/"
		>Methodology</role>
	<role vocab="credit" vocab-identifier="http://credit.niso.org/"
		vocab-term="Investigation"
		vocab-term-identifier="http://credit.niso.org/contributor-roles/investigation/"
		>Investigation</role>
	<role vocab="credit" vocab-identifier="http://credit.niso.org/"
		vocab-term="Data curation"
		vocab-term-identifier="http://credit.niso.org/contributor-roles/data-curation/"
		>Data curation</role>
	<role
		vocab="credit"
		vocab-identifier="http://credit.niso.org/"
		vocab-term="Formal Analysis"
		vocab-term-identifier="http://credit.niso.org/contributor-roles/formal-analysis/"
		>Formal analysis</role>
	<role
		vocab="credit"
		vocab-identifier="http://credit.niso.org/"
		vocab-term="Visualization"
		vocab-term-identifier="http://credit.niso.org/contributor-roles/visualization/"
		>Visualization</role>
	<role
		vocab="credit"
		vocab-identifier="http://credit.niso.org/"
		vocab-term="Writing – original draft"
		vocab-term-identifier="http://credit.niso.org/contributor-roles/writing-original-draft/"
		>Writing – original draft</role>
	<role
		vocab="credit"
		vocab-identifier="http://credit.niso.org/"
		vocab-term="Funding acquisition"
		vocab-term-identifier="http://credit.niso.org/contributor-roles/funding-acquisition/"
		>Funding acquisition</role>
	<role
		vocab="credit"
		vocab-identifier="http://credit.niso.org/"
		vocab-term="Project administration"
		vocab-term-identifier="http://credit.niso.org/contributor-roles/project-administration/"
		>Project administration</role>
</contrib>
	
	
	<contrib contrib-type="author"><contrib-id contrib-id-type="orcid" authenticated="false">https://orcid.org/0000-0003-3770-7483</contrib-id><name name-style="western"><surname>Sander</surname><given-names>Kilian</given-names></name><xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
	<role vocab="credit" vocab-identifier="http://credit.niso.org/"
		vocab-term="Conceptualization"
		vocab-term-identifier="http://credit.niso.org/contributor-roles/conceptualization/"
		>Conceptualization</role>
	<role vocab="credit" vocab-identifier="http://credit.niso.org/"
		vocab-term="Methodology"
		vocab-term-identifier="http://credit.niso.org/contributor-roles/methodology/"
		>Methodology</role>
	<role vocab="credit" vocab-identifier="http://credit.niso.org/"
		vocab-term="Investigation"
		vocab-term-identifier="http://credit.niso.org/contributor-roles/investigation/"
		>Investigation</role>
	<role vocab="credit" vocab-identifier="http://credit.niso.org/"
		vocab-term="Data curation"
		vocab-term-identifier="http://credit.niso.org/contributor-roles/data-curation/"
		>Data curation</role>
	<role
		vocab="credit"
		vocab-identifier="http://credit.niso.org/"
		vocab-term="Formal Analysis"
		vocab-term-identifier="http://credit.niso.org/contributor-roles/formal-analysis/"
		>Formal analysis</role>
	<role
		vocab="credit"
		vocab-identifier="http://credit.niso.org/"
		vocab-term="Visualization"
		vocab-term-identifier="http://credit.niso.org/contributor-roles/visualization/"
		>Visualization</role>
	<role
		vocab="credit"
		vocab-identifier="http://credit.niso.org/"
		vocab-term="Supervision"
		vocab-term-identifier="http://credit.niso.org/contributor-roles/supervision/"
		>Supervision</role>
	<role
		vocab="credit"
		vocab-identifier="http://credit.niso.org/"
		vocab-term="Validation"
		vocab-term-identifier="http://credit.niso.org/contributor-roles/validation/"
		>Validation</role>
	<role
		vocab="credit"
		vocab-identifier="http://credit.niso.org/"
		vocab-term="Writing – review &amp; editing"
		vocab-term-identifier="http://credit.niso.org/contributor-roles/writing-review-editing/"
		>Writing – review &amp; editing</role>
	<role
		vocab="credit"
		vocab-identifier="http://credit.niso.org/"
		vocab-term="Funding acquisition"
		vocab-term-identifier="http://credit.niso.org/contributor-roles/funding-acquisition/"
		>Funding acquisition</role>
	<role
		vocab="credit"
		vocab-identifier="http://credit.niso.org/"
		vocab-term="Project administration"
		vocab-term-identifier="http://credit.niso.org/contributor-roles/project-administration/"
		>Project administration</role>
</contrib>
	
	
	<contrib contrib-type="author"><contrib-id contrib-id-type="orcid" authenticated="false">https://orcid.org/0009-0003-4618-1783</contrib-id><name name-style="western"><surname>Schreiber</surname><given-names>Anton</given-names></name><xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
	<role vocab="credit" vocab-identifier="http://credit.niso.org/"
		vocab-term="Conceptualization"
		vocab-term-identifier="http://credit.niso.org/contributor-roles/conceptualization/"
		>Conceptualization</role>
	<role vocab="credit" vocab-identifier="http://credit.niso.org/"
		vocab-term="Methodology"
		vocab-term-identifier="http://credit.niso.org/contributor-roles/methodology/"
		>Methodology</role>
</contrib>
	
	
	<contrib contrib-type="author" corresp="yes"><contrib-id contrib-id-type="orcid" authenticated="false">https://orcid.org/0000-0003-3356-3478</contrib-id><name name-style="western"><surname>Kopiez</surname><given-names>Reinhard</given-names></name><xref ref-type="corresp" rid="cor1">*</xref><xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
	<role vocab="credit" vocab-identifier="http://credit.niso.org/"
		vocab-term="Conceptualization"
		vocab-term-identifier="http://credit.niso.org/contributor-roles/conceptualization/"
		>Conceptualization</role>
	<role vocab="credit" vocab-identifier="http://credit.niso.org/"
		vocab-term="Methodology"
		vocab-term-identifier="http://credit.niso.org/contributor-roles/methodology/"
		>Methodology</role>
	<role
		vocab="credit"
		vocab-identifier="http://credit.niso.org/"
		vocab-term="Supervision"
		vocab-term-identifier="http://credit.niso.org/contributor-roles/supervision/"
		>Supervision</role>
	<role
		vocab="credit"
		vocab-identifier="http://credit.niso.org/"
		vocab-term="Validation"
		vocab-term-identifier="http://credit.niso.org/contributor-roles/validation/"
		>Validation</role>
	<role
		vocab="credit"
		vocab-identifier="http://credit.niso.org/"
		vocab-term="Writing – review &amp; editing"
		vocab-term-identifier="http://credit.niso.org/contributor-roles/writing-review-editing/"
		>Writing – review &amp; editing</role>
	<role
		vocab="credit"
		vocab-identifier="http://credit.niso.org/"
		vocab-term="Funding acquisition"
		vocab-term-identifier="http://credit.niso.org/contributor-roles/funding-acquisition/"
		>Funding acquisition</role>
	<role
		vocab="credit"
		vocab-identifier="http://credit.niso.org/"
		vocab-term="Project administration"
		vocab-term-identifier="http://credit.niso.org/contributor-roles/project-administration/"
		>Project administration</role>
</contrib>
	
	
	
	<contrib contrib-type="reviewer"><name name-style="western"><surname>Zaddach</surname><given-names>Wolf-Georg</given-names></name></contrib>
	<contrib contrib-type="reviewer"><name name-style="western"><surname>Schlemmer</surname><given-names>Kathrin</given-names></name></contrib>
	
<aff id="aff1"><label>1</label><institution>Hanover University of Music, Drama and Media</institution>, <addr-line><city>Hannover</city></addr-line>, <country country="DE">Germany</country></aff>

</contrib-group>
<author-notes>
<corresp id="cor1"><label>*</label>Hanover Music Lab, Hanover University of Music, Drama and Media, Neues Haus 1, 30175 Hannover, Germany. <email xlink:href="reinhard.kopiez@hmtm-hannover.de">reinhard.kopiez@hmtm-hannover.de</email></corresp>
</author-notes>
<pub-date date-type="pub" publication-format="electronic"><day>16</day><month>07</month><year>2025</year></pub-date>
	<pub-date pub-type="collection" publication-format="electronic"><year>2025</year></pub-date>
<volume>33</volume><elocation-id>e221</elocation-id>
<history>
<date date-type="received">
<day>26</day>
<month>03</month>
<year>2025</year>
</date>
<date date-type="accepted">
<day>14</day>
<month>06</month>
<year>2025</year>
</date>
</history>
<permissions><copyright-year>2025</copyright-year><copyright-holder>Meier, Sander, Schreiber, &amp; Kopiez</copyright-holder><license license-type="open-access" specific-use="CC BY 4.0" xlink:href="https://creativecommons.org/licenses/by/4.0/"><ali:license_ref>https://creativecommons.org/licenses/by/4.0/</ali:license_ref><license-p>This is an open-access article distributed under the terms of the Creative Commons Attribution (CC BY) 4.0 License, which permits unrestricted use, distribution, and reproduction in any medium, provided the original work is properly cited.</license-p></license></permissions>
<abstract>
<p>Although AI systems have progressed significantly over the last two years in generating cultural products like literature, poems, or music, it remains an open question whether the aesthetic quality of these products increases in tandem with the performance enhancements of the underlying large language models (LLMs). We replicated the study by Schreiber et al. (2024) to test whether the creative performance of selected LLMs had improved over the past two years in the musical domain. In an online rating experiment based on a melody continuation paradigm, 75 melodic continuations generated by the AI systems <italic>Qwen 2</italic> (Version 72B Instruct), <italic>Llama 3</italic> (Version 70B Instruct), and <italic>ChatGPT</italic> (Version 4) were compared to 23 solutions composed by humans. The aesthetic quality of the sound examples was then evaluated by <italic>N</italic> = 54 listeners (music students) using four criteria (convincing, logical and meaningful, interesting, and liking). As the first main finding, human-based creative solutions outperformed all three AI systems on all four dependent variables (large effect sizes 1.11 ≤ <italic>d<sub>z</sub></italic> ≤ 2.51), thus confirming the finding by Schreiber et al. (2024). As the second main finding, listeners discriminated between AI- and human-based solutions with a meaningful mean sensitivity of <italic>d’</italic> = 1.09. We conclude that merely boosting the volume of the AI systems’ training does not guarantee a corresponding improvement in the creative musical output produced under controlled conditions.</p>
</abstract><trans-abstract xml:lang="de">
<p>Obwohl KI-Systeme in den letzten Jahren erhebliche Fortschritte bei der Erzeugung kultureller Produkte wie Literatur, Poesie oder Musik gemacht haben, bleibt die Frage offen, ob die ästhetische Qualität dieser Produkte mit der zunehmenden allgemeinen Leistungsfähigkeit der large language models (LLMs) ebenfalls angewachsen ist. In einer Replikation der Studie von Schreiber et al. (2024), überprüften wir, ob die kreative Leistungsfähigkeit ausgewählter LLMs auf dem Gebiet der Musik zugenommen hat. In einem Online-Rating-Experiment und unter Verwendung eines Melodiefortsetzungsparadigmas wurden 75 Melodiefortsetzungen der KI-Systeme <italic>Qwen 2</italic> (Version 72B Instruct), <italic>Llama 3</italic> (Version 70B Instruct) und <italic>ChatGPT</italic> (Version 4) mit 23 Fortsetzungsvarianten von Musikstudierenden verglichen. Die ästhetische Qualität der Fortsetzungen wurde von <italic>N</italic> = 54 Hörer*innen (Musikstudierende) mittels vier Items (überzeugend, logisch und sinnvoll, interessant, Gefallen) erfasst. Als erstes Hauptergebnis wurden die menschlichen Lösungen auf allen vier Bewertungsmerkmalen besser beurteilt als die KI-Lösungen (große Effektgröße 1.11 ≤ <italic>d<sub>z</sub></italic> ≤ 2.51), was die Ergebnisse von Schreiber et al. (2024) bestätigt. Das zweite Hauptergebnis zeigte eine mittlere Diskriminationssensitivität für die Identifikation des Ursprungs der Melodiefortsetzungen (<italic>d’</italic> = 1.09). Wir schlussfolgern, dass eine bloße Steigerung der Trainingsquantität von KI-Systemen keine Garantie für eine gleichfalls zunehmende ästhetische Qualität des unter kontrollierten Bedingungen erzeugten musikalischen Outputs bedeutet.</p></trans-abstract>
<kwd-group kwd-group-type="author"><kwd>Artificial Intelligence</kwd><kwd>AI</kwd><kwd>generative AI</kwd><kwd>composition</kwd><kwd>empirical aesthetics</kwd><kwd>melody rating</kwd><kwd>musical creativity</kwd><kwd>large language models</kwd></kwd-group>
<kwd-group kwd-group-type="translator" xml:lang="de"><kwd>Künstliche Intelligenz</kwd><kwd>KI</kwd><kwd>generative KI</kwd><kwd>Komposition</kwd><kwd>empirische Ästhetik</kwd><kwd>Melodiebewertung</kwd><kwd>musikalische Kreativität</kwd><kwd>Sprachmodelle</kwd></kwd-group>
</article-meta>
</front>
<body>
<sec sec-type="other1"><title>Background</title>
	<p>Following the emergence of user-friendly applications like <italic>ChatGPT</italic> (<xref ref-type="bibr" rid="r26">OpenAI, 2022</xref>, <xref ref-type="bibr" rid="r27">2023a</xref>, <xref ref-type="bibr" rid="r29">2024</xref>) in recent years, <italic>artificial intelligence</italic> (AI) has already begun to play an increasingly important role in the everyday lives of younger people in particular. Tools like <italic>DALL·E 3</italic> (<xref ref-type="bibr" rid="r28">OpenAI, 2023b</xref>) or the generative features implemented in the newer versions of <italic>Photoshop</italic> (<xref ref-type="bibr" rid="r1">Adobe, 2023</xref>) further underline the potential of AI to lower the barrier to entry still further when it comes to image creation. The story is similar in the musical domain, where interest in automating composition processes has been strong since the emergence of musical dice games in the eighteenth century (<xref ref-type="bibr" rid="r34">Steinbeck, 2016</xref>). At the end of the twentieth century, the topic prominently resurfaced in the <italic>Experiments in Musical Intelligence</italic> by the American composer David <xref ref-type="bibr" rid="r9">Cope (1996)</xref> and has attracted the attention of empirical researchers ever since.</p>
<p>Almost thirty years later, both hardware and software for artificial music creation have become far more powerful and widely available. Specialized server-based applications like <italic>AIVA</italic> (<xref ref-type="bibr" rid="r2">Aiva Technologies, 2016</xref>) or <italic>Suno AI</italic> (<xref ref-type="bibr" rid="r35">Suno, 2024</xref>) are readily available to anyone with a smartphone and an internet connection. These differ, among other things, in their openness to input and output formats (e.g., MIDI, Python, text-based prompts, sampled sounds). This paves the way for innumerable potential applications involving the deployment of AI systems in co-creativity processes between humans and machines (<xref ref-type="bibr" rid="r18">Gioti, 2021</xref>), which may pose a threat to the livelihood of traditional music creators due to, for example, the time and cost efficiency of AI-generated art. This worry is already very much present today, as shown in a recent survey conducted on behalf of the German and French collecting societies <italic>GEMA</italic> and <italic>SACEM</italic>, in which 71% of the surveyed members stated that they see their economic foundation threatened by AI (<xref ref-type="bibr" rid="r19">Goldmedia, 2024</xref>). Despite an obvious public interest in the topic, research into the creative musical potential of various AI models remains limited. Few studies have conducted controlled blind evaluations and/or compared the compositions of generative AI models with those of human musicians (<xref ref-type="bibr" rid="r32">Schreiber et al., 2024</xref>).</p>
<p>Although <xref ref-type="bibr" rid="r25">Oksanen et al. (2023)</xref> systematically reviewed a total of 44 empirical studies on AI in the fine arts between 2003 and 2021, only ten fell into the domain of music. Moreover, the content of these studies differed considerably. For example, <xref ref-type="bibr" rid="r17">Frieler and Zaddach (2022)</xref> examined participants’ ratings of jazz solos, which were either human- or AI-composed. The solos of professional jazz musicians were rated better on average than those of the AI model. Jazz experts were also able to recognize the AI compositions with an accuracy of 64.4%, as opposed to an accuracy of 41.7% for non-experts. <xref ref-type="bibr" rid="r15">Ferreira et al. (2023)</xref> also investigated the discrimination skills of participants with different musical expertise in the domain of classical piano music.</p>
	<p>As part of his early experiments, David Cope had already described a discrimination test for differentiating between human-made and artificially created music. He called it “The Game” (<xref ref-type="bibr" rid="r10">Cope, 2001</xref>), and the average success rate of participants consistently hovered between 40% and 60% (<xref ref-type="bibr" rid="r9">Cope, 1996</xref>). Participants with a success rate of over 66% earned the label “high-scorers” (<xref ref-type="bibr" rid="r10">Cope, 2001</xref>).</p>
	<p>In a recent study by <xref ref-type="bibr" rid="r32">Schreiber et al. (2024)</xref>, the authors exploratively analyzed the aesthetic ratings of melody continuations composed by the AI systems ChatGPT 3.5 (<xref ref-type="bibr" rid="r26">OpenAI, 2022</xref>) and <italic>Magenta Studio</italic> 2 (<xref ref-type="bibr" rid="r20">Google AI, 2023</xref>) as well as by a group of music students in a standardized melody continuation task. Each participant was presented with ten AI-composed and ten human-composed melody continuations. The authors found that the human compositions were rated significantly higher in terms of subjectively perceived quality on all four rating scales used (liking, interesting, logical and meaningful, and convincing) compared to the AI melodies, <italic>F</italic>(1, 67) = 91.114, <italic>p</italic> &lt; .001, Pillai’s trace = 0.857, η<sup>2</sup> = 0.576. Neither the length of the given melodies nor the musical expertise of the participants impacted the ratings significantly.</p>
<sec><title>Research Questions and Study Aim</title>
	<p>This study aims to conceptually replicate the findings of <xref ref-type="bibr" rid="r32">Schreiber et al. (2024)</xref> using an updated selection of topical (as of June 2024) <italic>large language models</italic> (LLMs): Qwen 2 72B Instruct (<xref ref-type="bibr" rid="r3">Alibaba Cloud, 2024</xref>), Llama 3 70B Instruct (<xref ref-type="bibr" rid="r22">Meta, 2024</xref>), and GPT-4 (<xref ref-type="bibr" rid="r27">OpenAI, 2023a</xref>). Although music-specific AI applications (e.g., UDIO or SUNO) may produce better results, they currently lack open input formats (e.g., based on Python code) required for the application of a standardized melody continuation task. Therefore, these systems are outside the scope of our research. In line with the definition by <xref ref-type="bibr" rid="r37">Wooldridge and Jennings (1995)</xref>, the selected LLMs are termed “AI systems” as they do not fulfill the criteria of proactive, flexible, or cooperative behavior with other computing systems, which would characterize “AI agent” systems. The AI systems were accessed via the platform <italic>AcademicCloud</italic> (<ext-link ext-link-type="uri" xlink:href="https://academiccloud.de/">https://academiccloud.de/</ext-link>), which is available to all university members in Lower Saxony.</p>
<p>To extend the stimulus basis of the original study, we used a different melody from the domain of popular music as a starting point for the standardized continuation task. This replication also added a discrimination task: based on <italic>signal detection theory</italic> (SDT), we were interested in how reliably listeners could discriminate between human- and AI-generated creative products.</p>
<p>The following research questions (RQ) were formulated: (RQ<sub>1</sub>) Can the significantly poorer subjective assessments of AI systems compared to human compositions in <xref ref-type="bibr" rid="r32">Schreiber et al. (2024)</xref> be replicated with another melody task and a similarly expert group of participants (music students)? (RQ<sub>2</sub>) Do any significant differences emerge in the subjectively perceived quality of the creative products among the three different LLMs used? (RQ<sub>3</sub>) Is the overall discrimination performance in the given task (human vs. AI composer) better than chance?</p></sec>
<sec><title>Hypotheses</title>
<p>Based on the results by <xref ref-type="bibr" rid="r32">Schreiber et al. (2024)</xref>, our first two hypotheses are as follows:</p>
<list id="L1" list-type="bullet">
<list-item>
<p>(H<sub>1</sub>) The subjective quality ratings for melody continuations by humans significantly exceed those obtained by three different AI systems.</p></list-item>
<list-item>
<p>(H<sub>2</sub>) The subjective ratings of aesthetic quality differ significantly when the results obtained from the three different AI systems are compared.</p></list-item>
</list>
<p>Based on the results by <xref ref-type="bibr" rid="r9">Cope (1996)</xref>, our third hypothesis is as follows:</p>
<list id="L2" list-type="bullet">
<list-item>
<p>(H<sub>3</sub>) The perceptual discrimination between continuations by humans vs. AI systems does not exceed chance level.</p></list-item>
</list></sec></sec>
<sec sec-type="methods"><title>Method</title>
<sec><title>Study Design and Pre-Registration</title>
<p>We measured four dependent variables, each involving a single item, to determine subjective ratings for the aesthetic quality of human and AI-based melody continuations. These four scales were adopted from the study conducted by <xref ref-type="bibr" rid="r32">Schreiber et al. (2024)</xref>: <italic>convincing</italic>, <italic>logical and meaningful</italic>, <italic>interesting</italic>, and <italic>liking</italic>. The condition <italic>composer_specific</italic> served as an independent variable with four levels: <italic>Qwen 2</italic>, <italic>Llama 3</italic>, <italic>GPT-4</italic>, and <italic>human</italic>. This resulted in a repeated measures design with four dependent variables and the composer as within-subject factor. Given our interest in the distinction between AI systems and humans, we aggregated the three AI models in parts of the statistical analysis, resulting in a dichotomous independent variable with two remaining levels (AI vs. human).</p>
	<p>An a priori power analysis was conducted using G*Power (<xref ref-type="bibr" rid="r14">Faul et al., 2009</xref>) based on the (large) effect sizes reported by <xref ref-type="bibr" rid="r32">Schreiber et al. (2024)</xref>. Based on the Pillai’s trace value of 0.857 for the main effect of <italic>composer</italic>, a minimum of <italic>N</italic> ≥ 12 participants was required to reach the desired power of 1−β = .95 at an α level of .05. The study was pre-registered (see <xref ref-type="bibr" rid="sp1_r2">Meier et al., 2024</xref>).</p></sec>
<sec><title>Musical Stimuli</title>
	<p>We used a melody continuation paradigm in line with <xref ref-type="bibr" rid="r32">Schreiber et al. (2024)</xref>, chosen also in view of the lack of tools for the standardized evaluation of performance in AI and music research, as diagnosed by <xref ref-type="bibr" rid="r24">Mycka and Mańdziuk (2025</xref>). As a starting point, we simplified the chorus melody of a German pop ballad (<italic>Durch die schweren Zeiten [Through the Hard Times]</italic> by Udo Lindenberg; see <xref ref-type="fig" rid="fA.1">Figure A1</xref>). The musical stimuli were created as follows: (a) music students from various study programs (mainly music education) at the Hanover University of Music, Drama and Media composed continuations of the given melody following a standardized instruction (see Appendix 1); (b) the three selected AI systems also continued the melody following the same standardized instruction (see Appendix 2 for the given text prompt). To allow the musical products of all three AI systems to be converted into audio files for experimental use, we transcribed the given melody in Python syntax using the module <italic>SCAMP</italic> (<xref ref-type="bibr" rid="r13">Evanstein, 2023</xref>) and instructed the systems to return their continuations in the same format.</p>
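	<p>As an illustration of the kind of Python syntax involved, the following minimal sketch plays a short note list through SCAMP. The note values are a hypothetical placeholder (not the melody used in the study), and only SCAMP’s basic Session/new_part/play_note interface is assumed here.</p>
	<preformat preformat-type="code"># Illustrative sketch only: the note list is a made-up placeholder,
# not the actual melody used in the study.
from scamp import Session

# (MIDI pitch, duration in beats)
melody = [(64, 1.0), (66, 0.5), (67, 1.5), (71, 1.0)]

session = Session(tempo=90)
clarinet = session.new_part("clarinet")

for pitch, duration in melody:
    # play_note(pitch, volume, length) is SCAMP's basic playback call
    clarinet.play_note(pitch, 0.8, duration)</preformat>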
	<p>The resulting melody continuations from the music students and AI systems were then converted into MIDI format using <italic>MuseScore</italic> (<xref ref-type="bibr" rid="r23">MuseScore Ltd, 2024</xref>) and SCAMP (<xref ref-type="bibr" rid="r13">Evanstein, 2023</xref>), respectively. In the final step, we imported these MIDI files into <italic>Reaper</italic> (<xref ref-type="bibr" rid="r8">Cockos, 2024</xref>), before exporting them as MP3 audio files using the <italic>BBC Symphony Orchestra Discover</italic> (<xref ref-type="bibr" rid="r33">Spitfire Audio, 2023</xref>) sample library with a clarinet timbre. Sound files were normalized to a loudness of −20 LUFS-I. A total of <italic>N</italic> = 98 melodies were generated (Qwen 2: <italic>n</italic> = 25, Llama 3: <italic>n</italic> = 25, GPT-4: <italic>n</italic> = 25, and human: <italic>n</italic> = 23; see Supplementary Materials section for details, <xref ref-type="bibr" rid="sp1_r1">Meier et al., 2025</xref>).</p></sec>
<sec><title>Procedure</title>
	<p>The study was conducted as an online experiment: 55 participants completed a questionnaire on the <italic>SoSci Survey</italic> platform (<ext-link ext-link-type="uri" xlink:href="https://www.soscisurvey.de/en/index">https://www.soscisurvey.de/en/index</ext-link>). Participants were informed that they would listen to a set of melodies that all started in the same way but then continued with either AI- or human-composed portions. After giving their informed consent via a checkbox (see Ethics Statement), participants could listen to the given melody, from which the AI systems and music students had composed their continuations. Participants were presented with a random selection of 20 melodies (five for each condition: Qwen 2, Llama 3, GPT-4, and human) in a randomized, blinded trial, resulting in an incomplete study design. The melodies were rated on a 5-point scale (1 = <italic>not at all</italic> [<italic>gar nicht</italic>] to 5 = <italic>very much</italic> [<italic>sehr</italic>]) using the criteria <italic>convincing</italic>, <italic>logical and meaningful</italic>, <italic>interesting</italic>, and <italic>liking</italic>. There was no trial run. In addition to these ratings, participants used an eight-point scale (1 = <italic>clearly human</italic> [<italic>eindeutig Mensch</italic>], 8 = <italic>clearly machine</italic> [<italic>eindeutig Maschine</italic>], see <xref ref-type="fig" rid="fA.2">Figure A2</xref>) to indicate who they thought had composed the musical continuation (human or AI) and how confident they were (with the endpoints of the scale indicating higher confidence).</p>
<p>After listening to and rating the melodies, participants specified their gender, age, and musical identity on a single item (<xref ref-type="bibr" rid="r38">Zhang &amp; Schubert, 2019</xref>). At the end of the experiment, they were also informed of their number of correct responses in the detection task. Given time constraints and the homogeneous group of highly expert participants, we decided against administering an additional inventory to assess musical sophistication and against varying the melodic probe positions (lengths of the given melody), since neither of these variables had a significant impact in the study by <xref ref-type="bibr" rid="r32">Schreiber et al. (2024)</xref>. The experiment took about 20 minutes to complete, and participants were not remunerated for their involvement.</p></sec>
<sec><title>Sample Description</title>
<p>Of the 55 initial participants, one was excluded due to spurious responses, resulting in a final sample size of <italic>N</italic> = 54. Participants were recruited via mailing lists at different German-speaking Universities of Music. Of the sample, 30 participants (55.6%) were male, 23 (42.6%) female, and one (1.9%) non-binary. The participants were aged between 19 and 61 years (<italic>Mdn</italic> = 25, <italic>IQR</italic> = 9). Occasional age outliers can be explained by the participation of some professors, who completed the questionnaire alongside their students. The fact that we actively targeted music university students should have ensured an above-average level of musical expertise for the entire sample, with 45 participants (83.3%) reporting at least 10 years of formal music lessons.</p></sec></sec>
<sec sec-type="results"><title>Results</title>
<sec><title>Ratings of the Melodies</title>
<p>As <xref ref-type="fig" rid="f1">Figure 1</xref> shows, the human-composed melodies were rated higher for all four dependent variables. Comparing the three LLMs, we can see that Qwen 2 always obtained the highest average ratings, followed closely by or tied with GPT-4. Llama 3 always showed the lowest scores.</p>
	
	<fig id="f1" position="anchor" fig-type="figure" orientation="portrait"><label>Figure 1</label><caption>
			<title>Box Plots of the Melody Ratings for Each Dependent Variable Grouped by Composer_Specific</title><p><italic>Note.</italic> Central tendencies represent the mean with 95% CI, horizontal bars represent the median, and grey dots represent outliers.</p></caption><graphic xlink:href="jbdgm.221-f1" position="anchor" orientation="portrait"/></fig>

	
<p>Given our key focus on comparing AI systems and humans, we aggregated the melody ratings for the three LLMs into a single AI level of the independent variable <italic>composer</italic>. <xref ref-type="fig" rid="f2">Figure 2</xref> thus shows the ratings for all four dependent variables grouped by AI vs. human.</p>
	
	<fig id="f2" position="anchor" fig-type="figure" orientation="portrait"><label>Figure 2</label><caption>
			<title>Error Bar Diagram for Each Dependent Variable in the Dichotomous Comparison of AI vs. Human</title><p><italic>Note.</italic> The rating scale ranges from 1 (minimum) to 5 (maximum). Error bars represent means with 95% CI.</p></caption><graphic xlink:href="jbdgm.221-f2" position="anchor" orientation="portrait"/></fig>

	
<p>Accordingly, we conclude that Hypothesis 1 (higher aesthetic ratings for human-composed melody continuations compared to the three AI systems) could be confirmed. The results also confirm Hypothesis 2 (significant differences between the three AI systems in the aesthetic quality of the melody continuations).</p>
<p>To test for the main effect of the independent variable <italic>composer</italic>, we calculated a repeated measures MANOVA in R (<xref ref-type="bibr" rid="r31">R Core Team, 2024</xref>) using the stats package. This analysis revealed that the <italic>composer</italic> factor had a significant effect on the ratings of the four dependent variables, <italic>F</italic>(1, 53) = 94.07, <italic>p</italic> &lt; .001, Pillai’s trace = 0.88, η<sup>2</sup> = 0.64. We also conducted paired <italic>t</italic>-tests for groupwise comparisons on all four dependent variables shown in <xref ref-type="fig" rid="f2">Figure 2</xref>. Effect sizes fall into the range of large effects (<italic>d<sub>Z</sub></italic> &gt; 0.8; for benchmarks see <xref ref-type="bibr" rid="r12">Ellis, 2010</xref>) in favor of the human-based compositions (see <xref ref-type="table" rid="t1">Table 1</xref>). The differences between the four evaluation items and the two sources of melodic origin are very similar for the original study and its replication.</p>
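<p>As a minimal sketch (not the authors’ R analysis code), the following Python lines show how Cohen’s <italic>d<sub>Z</sub></italic> for a paired AI vs. human comparison can be computed from each participant’s mean ratings; the rating values below are hypothetical.</p>
<preformat preformat-type="code">import numpy as np
from scipy import stats

# Hypothetical per-participant mean ratings (1-5 scale), one pair per listener
ai_ratings    = np.array([2.1, 2.6, 1.9, 2.4])
human_ratings = np.array([3.8, 4.1, 3.5, 3.9])

diff = ai_ratings - human_ratings
t, p = stats.ttest_rel(ai_ratings, human_ratings)    # paired t-test

# Cohen's d_z: mean of the paired differences divided by their SD
d_z = diff.mean() / diff.std(ddof=1)
print(f"t = {t:.2f}, p = {p:.4f}, d_z = {d_z:.2f}")  # negative d_z: AI rated lower</preformat>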
<table-wrap id="t1" position="anchor" orientation="portrait">
<label>Table 1</label><caption><title>Effect Sizes for the Comparison of AI vs. Human (t-Tests) Compared to Those Found by Schreiber et al. (2024)</title></caption>
<table frame="hsides" rules="groups">
<col width="30%" align="left"/>
<col width="15%"/>
<col width="10%"/>
<col width="10%"/>
<col width="15%"/>
<col width="10%"/>
<col width="10%"/>
<thead>
<tr>
<th rowspan="3" valign="bottom" align="left">DV</th>
	<th colspan="3" scope="colgroup">Present Study<hr/></th>
	<th colspan="3" scope="colgroup"><xref ref-type="bibr" rid="r32">Schreiber et al. (2024)</xref><hr/></th>
</tr>
<tr>
<th rowspan="2" scope="colgroup" valign="bottom">Cohen’s <italic>d<sub>Z</sub></italic></th>
	<th colspan="2" scope="colgroup">95% CI<hr/></th>
<th rowspan="2" valign="bottom">Cohen’s <italic>d<sub>Z</sub></italic></th>
	<th colspan="2" scope="colgroup">95% CI<hr/></th>
</tr>
<tr>
<th scope="colgroup">LL</th>
<th>UL</th>
<th scope="colgroup">LL</th>
<th>UL</th>
</tr>
</thead>
<tbody>
<tr>
<td>convincing</td>
<td align="char" char=".">−2.29</td>
<td align="char" char=".">−2.80</td>
<td align="char" char=".">−1.78</td>
<td align="char" char=".">−2.11</td>
<td align="char" char=".">−2.53</td>
<td align="char" char=".">−1.69</td>
</tr>
<tr>
<td>logical and meaningful</td>
<td align="char" char=".">−2.51</td>
<td align="char" char=".">−3.05</td>
<td align="char" char=".">−1.97</td>
<td align="char" char=".">−1.93</td>
<td align="char" char=".">−2.32</td>
<td align="char" char=".">−1.53</td>
</tr>
<tr>
<td>interesting</td>
<td align="char" char=".">−1.11</td>
<td align="char" char=".">−1.45</td>
<td align="char" char=".">−0.77</td>
<td align="char" char=".">−1.74</td>
<td align="char" char=".">−2.11</td>
<td align="char" char=".">−1.37</td>
</tr>
<tr>
<td>liking</td>
<td align="char" char=".">−1.79</td>
<td align="char" char=".">−2.22</td>
<td align="char" char=".">−1.36</td>
<td align="char" char=".">−2.23</td>
<td align="char" char=".">−2.66</td>
<td align="char" char=".">−1.79</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<p><italic>Note.</italic> Negative values indicate that AI melody continuations were rated lower than those generated by humans.</p>
</table-wrap-foot>
</table-wrap>
	<p>To round off this part of the data analysis, we calculated a correlation matrix (Pearson) for all four dependent variables to assess the strength of the correlations between the different rating scales (see <xref ref-type="table" rid="t2">Table 2</xref>). The variables <italic>convincing</italic> and <italic>liking</italic> showed the strongest correlation (<italic>r</italic> = .90).</p>
<table-wrap id="t2" position="anchor" orientation="portrait">
<label>Table 2</label><caption><title>Correlation Matrix for the Dependent Variables (Pearson)</title></caption>
<table frame="hsides" rules="groups">
<col width="40%" align="left"/>
<col width="15%"/>
<col width="15%"/>
<col width="15%"/>
<col width="15%"/>
<thead>
<tr>
<th>Variable</th>
<th>1</th>
<th>2</th>
<th>3</th>
<th>4</th>
</tr>
</thead>
<tbody>
<tr>
<td>1. convincing</td>
	<td>—</td>
<td/>
<td/>
<td/>
</tr>
<tr>
<td>2. logical and meaningful</td>
<td align="char" char=".">0.89</td>
	<td>—</td>
<td/>
<td/>
</tr>
<tr>
<td>3. interesting</td>
<td align="char" char=".">0.79</td>
<td align="char" char=".">0.73</td>
	<td>—</td>
<td/>
</tr>
<tr>
<td>4. liking</td>
<td align="char" char=".">0.90</td>
<td align="char" char=".">0.79</td>
<td align="char" char=".">0.83</td>
	<td>—</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<p><italic>Note. p</italic> &lt; .001 for all correlations (<italic>N</italic> = 54).</p>
</table-wrap-foot>
</table-wrap></sec>
<sec><title>Discrimination Performance</title>
<p>The second step of data analysis aimed to reveal the underlying discrimination performance of listeners as measured by Signal Detection Theory (SDT). Since the melody continuation falls into two distinct categories (composed either by AI or by humans), with only one at a time presented for evaluation, our discrimination task falls into the family of so-called A-Not A or Yes-No experiments (<xref ref-type="bibr" rid="r5">Bi &amp; Ennis, 2001</xref>; <xref ref-type="bibr" rid="r21">Hautus et al., 2021</xref>). In our study, a melody continuation composed by an AI system is designated as “A”, “Yes”, or the presence of the signal, while a human-composed continuation is assigned to the category “Not A”, “No”, or the absence of the signal—in short, an AI-Not AI design. Following the classification of A-Not A sub-designs provided by <xref ref-type="bibr" rid="r11">Düvel and Kopiez (2022</xref>, Table 1), we have a replicated mixed A-Not A (but not paired) design—replicated because participants were presented with multiple stimuli; mixed because they were presented with stimuli from both categories (A and Not A).</p>
<p>Participants’ responses on the eight-point rating scale were dichotomized (AI vs. human), with responses from one to four counted as human and responses from five to eight counted as AI. They were then classified as follows: If an AI-composed melody continuation was correctly identified, the response was coded as a “hit”; if a human-composed melody continuation was misidentified as AI-composed, the response was coded as a “false alarm”. Conversely, if a human-composed melody continuation was correctly identified as human-composed, the response was coded as a “correct rejection”, whereas an AI-composed melody continuation identified as human-composed was coded as a “miss”. <xref ref-type="table" rid="t3">Table 3</xref> shows the respective frequencies of hits, misses, correct rejections, and false alarms. The independence of the stimulus type and the participants’ responses can be tested based on this 2 × 2 table. Following <xref ref-type="bibr" rid="r4">Bi (2015)</xref> and <xref ref-type="bibr" rid="r7">Brier (1980)</xref>, the test statistic is a conventional Pearson χ<sup>2</sup> test with a single degree of freedom once the data have been transformed with a correction factor. The test yields χ<sup>2</sup> = 111.62, <italic>p</italic> &lt; .001, and, with Yates’ continuity correction, χ<sup>2</sup> = 109.82, <italic>p</italic> &lt; .001. Accordingly, the null hypothesis that participants’ responses are independent of the stimulus type has to be rejected.</p>
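<p>For clarity, the dichotomization and response coding described above can be summarized in a short Python sketch (an illustration of the coding scheme, not the analysis code itself):</p>
<preformat preformat-type="code">def dichotomize(response):
    """Map the 8-point scale to a binary judgment: 1-4 = 'human', 5-8 = 'AI'."""
    return "AI" if response >= 5 else "human"

def sdt_category(response, composer):
    """Classify one trial; composer is the true origin ('AI' or 'human')."""
    judgment = dichotomize(response)
    if composer == "AI":
        return "hit" if judgment == "AI" else "miss"
    return "false alarm" if judgment == "AI" else "correct rejection"</preformat>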
<table-wrap id="t3" position="anchor" orientation="portrait">
<label>Table 3</label><caption><title>Frequencies of Signal Detection Theory Response Types</title></caption>
<table frame="hsides" rules="groups">
<col width="30%" align="left"/>
<col width="15%"/>
<col width="20%"/>
<col width="20%"/>
<col width="15%"/>
<thead>
<tr>
<th rowspan="2" colspan="2" scope="colgroup" valign="bottom"></th>
	<th colspan="2" scope="colgroup">Melody continuation composed by<hr/></th>
<th rowspan="2" valign="bottom">Row Sums</th>
</tr>
<tr>
<th scope="colgroup">AI</th>
<th>Human</th>
</tr>
</thead>
<tbody>
<tr>
<td rowspan="6">Participant responds</td>
<td rowspan="3">AI</td>
<td>Hits</td>
<td>False alarms</td>
<td/>
</tr>
<tr>
<td><italic>n</italic> = 546</td>
<td><italic>n</italic> = 62</td>
<td><italic>n</italic> = 608</td>
</tr>
<tr>
<td align="char" char=".">50.6%</td>
<td align="char" char=".">5.7%</td>
<td align="char" char=".">56.3%</td>
</tr>
<tr style="grey-border-top">
<td rowspan="3">Human</td>
<td>Misses</td>
<td>Correct Rejections</td>
<td/>
</tr>
<tr>
<td><italic>n</italic> = 264</td>
<td><italic>n</italic> = 208</td>
<td><italic>n</italic> = 472</td>
</tr>
<tr>
<td align="char" char=".">24.4%</td>
<td align="char" char=".">19.3%</td>
<td align="char" char=".">43.7%</td>
</tr>
<tr style="grey-border-top">
<td rowspan="2" colspan="2" align="left">Column Sums</td>
<td><italic>n</italic> = 810</td>
<td><italic>n</italic> = 270</td>
<td><italic>n</italic> = 1080</td>
</tr>
<tr>
<td align="char" char=".">75.0%</td>
<td align="char" char=".">25.0%</td>
<td align="char" char=".">100.0%</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>To quantify a participant’s discrimination performance, we calculated the sensitivity d prime (<italic>d’</italic>), which was based on the participant’s hit rate and false-alarm rate (<xref ref-type="bibr" rid="r21">Hautus et al., 2021</xref>, p. 7). These rates were then converted to <italic>z</italic> scores with the inverse of the normal cumulative distribution function (Φ<sup>−1</sup>, see Equation 1).</p>
	
	<disp-formula id="e1"><label>Equation 1</label><mml:math id="m1"><mml:msup><mml:mrow><mml:mi>d</mml:mi></mml:mrow><mml:mrow><mml:mi>'</mml:mi></mml:mrow></mml:msup><mml:mo>=</mml:mo><mml:msup><mml:mrow><mml:mtext>Φ</mml:mtext></mml:mrow><mml:mrow><mml:mo>-</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msup><mml:mfenced separators="|"><mml:mrow><mml:mtext>hit rate</mml:mtext></mml:mrow></mml:mfenced><mml:mo>-</mml:mo><mml:msup><mml:mrow><mml:mtext>Φ</mml:mtext></mml:mrow><mml:mrow><mml:mo>-</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msup><mml:mfenced separators="|"><mml:mrow><mml:mtext>false-alarm rate</mml:mtext></mml:mrow></mml:mfenced></mml:math></disp-formula>
	
<p>A <italic>d’</italic> value of 0 corresponds to chance-level performance. Positive values indicate that participants were able to discriminate AI-composed melody continuations from those composed by humans.</p>
	<p>We also calculated the response bias or criterion <italic>c</italic> (see Equation 2).</p><disp-formula id="e2"><label>Equation 2</label><mml:math id="m2"><mml:mi>c</mml:mi><mml:mo>=</mml:mo><mml:mo>-</mml:mo><mml:mfrac><mml:mrow><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:mfrac><mml:mfenced separators="|"><mml:mrow><mml:msup><mml:mrow><mml:mtext>Φ</mml:mtext></mml:mrow><mml:mrow><mml:mo>-</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msup><mml:mfenced separators="|"><mml:mrow><mml:mtext>hit rate</mml:mtext></mml:mrow></mml:mfenced><mml:mo>+</mml:mo><mml:msup><mml:mrow><mml:mtext>Φ</mml:mtext></mml:mrow><mml:mrow><mml:mo>-</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msup><mml:mfenced separators="|"><mml:mrow><mml:mtext>false-alarm rate</mml:mtext></mml:mrow></mml:mfenced></mml:mrow></mml:mfenced></mml:math></disp-formula>
<p>The response bias reflects the tendency toward one of the two response categories (<xref ref-type="bibr" rid="r21">Hautus et al., 2021</xref>). In our case, <italic>c</italic> &lt; 0 indicates a tendency to decide that the melody was continued by an AI system. For both sensitivity and response bias, hit or false-alarm rates of 0 or 1 had to be corrected (otherwise <italic>d’</italic> or <italic>c</italic> would equal ±∞). Following <xref ref-type="bibr" rid="r21">Hautus et al. (2021</xref>, p. 7), a rate of 0 was corrected to 1/(2<italic>N</italic>), where <italic>N</italic> is the number of trials on which the rate is based; a rate of 1 was corrected to 1−1/(2<italic>N</italic>).</p>
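<p>For illustration, Equations 1 and 2 and the rate correction can be expressed in a few lines of Python (a sketch of the formulas with hypothetical counts, not the authors’ analysis code):</p>
<preformat preformat-type="code">from scipy.stats import norm

def corrected_rate(count, n_trials):
    """Proportion with the 1/(2N) correction for rates of 0 or 1."""
    rate = count / n_trials
    if rate == 0.0:
        return 1.0 / (2 * n_trials)
    if rate == 1.0:
        return 1.0 - 1.0 / (2 * n_trials)
    return rate

def sensitivity_and_bias(hits, n_ai, false_alarms, n_human):
    """d' (Equation 1) and response bias c (Equation 2) for one participant."""
    hit_rate = corrected_rate(hits, n_ai)
    fa_rate = corrected_rate(false_alarms, n_human)
    d_prime = norm.ppf(hit_rate) - norm.ppf(fa_rate)       # Equation 1
    c = -0.5 * (norm.ppf(hit_rate) + norm.ppf(fa_rate))    # Equation 2
    return d_prime, c

# Hypothetical participant: 15 AI trials and 5 human trials (as in the design)
print(sensitivity_and_bias(hits=11, n_ai=15, false_alarms=1, n_human=5))</preformat>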
<p>As shown in <xref ref-type="fig" rid="f3">Figure 3A</xref>, most sensitivity values exceeded 0, with a mean of <italic>d’</italic> = 1.09. A one-tailed one-sample <italic>t</italic>-test confirmed a significant difference from 0, <italic>t</italic>(53) = 8.88, <italic>p</italic> &lt; .001, Cohen’s <italic>d</italic> = 1.21, 95% CI [0.61, 1.80]. Based on the benchmarks for <italic>d’</italic> provided by <xref ref-type="bibr" rid="r4">Bi (2015</xref>, Table 3.1), a mean sensitivity of <italic>d’</italic> = 1.09 constitutes a meaningful discrimination sensitivity (0.74 ≤ <italic>d’</italic> ≤ 1.81) between AI- and human-composed melody continuations.</p>
	
	<fig id="f3" position="anchor" fig-type="figure" orientation="portrait"><label>Figure 3</label><caption>
			<title>Histograms of Sensitivity and Response Bias</title><p><italic>Note.</italic> Panel A: Histogram of the sensitivity <italic>d’.</italic> Panel B: Histogram of the response bias <italic>c.</italic> Dashed lines represent the means of <italic>d’</italic> and <italic>c,</italic> respectively.</p></caption><graphic xlink:href="jbdgm.221-f3" position="anchor" orientation="portrait"/></fig>
	
<p>Considering <xref ref-type="fig" rid="f3">Figure 3B</xref>, the response bias scores seem to be distributed around 0. Indeed, their mean value is almost 0. A two-tailed one-sample <italic>t</italic>-test indicated no difference from 0, <italic>t</italic>(53) = −0.02, <italic>p</italic> = .986, Cohen’s <italic>d</italic> = −0.00, 95% CI [−0.55, 0.54].</p>
	
	
<p><xref ref-type="table" rid="tA.1">Table A1</xref> (see <xref ref-type="app" rid="app1">Appendix</xref>) shows the frequencies of hits and misses for each AI system. The melody continuations by Llama 3 were assigned as AI more often than those by ChatGPT 4 which, in turn, were detected as AI-based more often than those generated by Qwen 2. Accordingly, the frequencies of misses are in the same ranking order as the AI system ratings.</p>
<p>Using the musical identity item (years of formal instrumental lessons), the sample was split into two groups (Group 1: &gt; 10 years, <italic>n</italic><sub>&gt; 10 years</sub> = 45; Group 2: 6–10 years, <italic>n</italic><sub>6–10 years</sub> = 9). As shown by <xref ref-type="fig" rid="fA.3">Figure A3</xref> (see <xref ref-type="app" rid="app1">Appendix</xref>), the sensitivity values covered a similar range in both groups. Their means of <italic>d’</italic><sub>&gt; 10 years</sub> = 1.18 and <italic>d’</italic><sub>6–10 years</sub> = 0.68, compared using Welch’s <italic>t</italic>-test, did not differ significantly, <italic>t</italic>(9.86) = −1.24, <italic>p</italic> = .242.</p></sec>
<sec><title>Discrimination Performance in the Study by Schreiber et al. (2024)</title>
<p>Although <xref ref-type="bibr" rid="r32">Schreiber et al. (2024)</xref> did not report any results on discrimination performance, the authors had used the same discrimination paradigm in their study. We therefore obtained the raw data from their study and coded the responses identically to conduct equivalent discrimination analyses. Nine participants were removed from the data set (one due to missing values and spurious responses, and eight because they had been presented with one stimulus fewer in the detection task), resulting in a valid sample of <italic>N</italic> = 62.</p>
<p><xref ref-type="table" rid="tA.2">Table A2</xref> (see <xref ref-type="app" rid="app1">Appendix</xref>) displays the absolute and relative frequencies of the SDT response types. The statistical test on this 2 × 2 table resulted in χ<sup>2</sup> = 227.11, <italic>p</italic> &lt; .001, and with Yates’ continuity correction in χ<sup>2</sup> = 225.58, <italic>p</italic> &lt; .001. This means the null hypothesis that participants’ responses in the study by <xref ref-type="bibr" rid="r32">Schreiber et al. (2024)</xref> are independent of the stimulus type also has to be rejected.</p>
<p>Sensitivity and response bias yield mean values of <italic>d’</italic> = 1.12 and <italic>c</italic> = −0.03, respectively. As shown by <xref ref-type="fig" rid="f4">Figure 4</xref>, the distributions of sensitivity and response bias broadly resemble those from the replication study (see <xref ref-type="fig" rid="f3">Figure 3</xref>). Based on one-sample <italic>t</italic>-tests, only the sensitivity differs from 0, one-tailed <italic>t</italic>(61) = 11.21, <italic>p</italic> &lt; .001, Cohen’s <italic>d</italic> = 1.42, 95% CI [0.86, 1.99], but not the response bias, two-tailed <italic>t</italic>(61) = −0.85, <italic>p</italic> = .401, Cohen’s <italic>d</italic> = −0.11, 95% CI [−0.62, 0.40]. As in the data set of our replication study, the mean sensitivity of <italic>d’</italic> = 1.12 in the original study falls within the benchmark interval for a meaningful discrimination sensitivity between AI- and human-composed melody continuations (0.74 ≤ <italic>d’</italic> ≤ 1.81; <xref ref-type="bibr" rid="r4">Bi, 2015</xref>, Table 3.1).</p>
	
	<fig id="f4" position="anchor" fig-type="figure" orientation="portrait"><label>Figure 4</label><caption>
			<title>Histograms of Sensitivity and Response Bias in the Data from Schreiber et al. (2024)</title><p><italic>Note.</italic> Panel A: Histogram of the sensitivity <italic>d’.</italic> Panel B: Histogram of the response bias <italic>c.</italic> Dashed lines represent the means of <italic>d’</italic> and <italic>c,</italic> respectively.</p></caption><graphic xlink:href="jbdgm.221-f4" position="anchor" orientation="portrait"/></fig>

	
<p>Similar to the results based on the ratings reported by <xref ref-type="bibr" rid="r32">Schreiber et al. (2024)</xref>, ChatGPT 3.5 recorded a higher number of misses (i.e., its melody continuations were more often misattributed to a human composer) than Magenta (see <xref ref-type="table" rid="tA.3">Table A3</xref>). The data by <xref ref-type="bibr" rid="r32">Schreiber et al. (2024)</xref> encompassed all three levels of musical identity. As can be seen in <xref ref-type="fig" rid="fA.4">Figure A4</xref> (see <xref ref-type="app" rid="app1">Appendix</xref>), the sensitivity values are similarly distributed across the groups. Neither a one-way analysis of variance, <italic>F</italic>(2, 59) = 0.41, <italic>p</italic> = .666, η<sup>2</sup><sub>generalized</sub> = .014, nor a Kruskal-Wallis test, χ<sup>2</sup>(2) = 1.58, <italic>p</italic> = .453, revealed any significant difference in sensitivity values between the three groups.</p></sec></sec>
<sec sec-type="discussion"><title>Discussion</title>
	<p>Our results largely correspond to those of the original study by <xref ref-type="bibr" rid="r32">Schreiber et al. (2024)</xref>, and we were able to replicate the authors’ main outcome, namely that “the subjectively perceived and empirically confirmed quality of AI compositions is far below human-made compositions” (p. 7). This difference was particularly clear in the case of the two dependent variables <italic>convincing</italic> and <italic>logical and meaningful</italic>, where the AI compositions scored over two standard deviations lower than their human-made counterparts. This becomes apparent when listening to some of the compositions generated by the LLMs (see Supplementary Materials section for sound examples, <xref ref-type="bibr" rid="sp1_r1">Meier et al., 2025</xref>). We agree with <xref ref-type="bibr" rid="r32">Schreiber et al.’s (2024)</xref> sentiment that “the AI melodies sounded illogical and strange to our Western understanding of melodic construction” (p. 7). This becomes especially obvious towards the end of the melodic continuations, since they often stop abruptly without reaching a proper tonal resolution.</p>
<p>The AI systems used in our study also seemed to lack any concept of tonality. For example, although the given melody was in the key of E minor, the AI systems frequently used an F instead of the correct diatonic scale step of F sharp in their melodic continuations, resulting in an unmistakably off-key sound.</p>
<p>Comparing the effect sizes in <xref ref-type="table" rid="t1">Table 1</xref> to those reported by <xref ref-type="bibr" rid="r32">Schreiber et al. (2024)</xref>, we note smaller differences in the ratings for the dependent variables <italic>interesting</italic> and <italic>liking</italic>, while the gap for <italic>logical and meaningful</italic> has widened. This suggests that although participants found the unconventional melodic continuations comparatively enjoyable to listen to, they simultaneously evaluated them as worse in terms of music-theoretical criteria. The latter could result from our expert sample recognizing technical flaws more accurately.</p>
<p>The SDT analyses of both our replication and the original study by <xref ref-type="bibr" rid="r32">Schreiber et al. (2024)</xref> show that participants could discriminate AI-composed melody continuations from those created by humans. Furthermore, no significant differences emerged in the detection performance between the two levels of musical identity (&gt; 10 years and 6–10 years of instrumental lessons). In other words, identifying and evaluating AI-generated music does not require high compositional expertise, and the musical sophistication of an average music student seemingly suffices. Based on current research, the question remains open whether listeners without formal music training might also be able to reliably discriminate between the two sources of musical creation (AI and human), drawing only on musical capacities acquired through everyday exposure to tonal music. This would tally with the “experienced listeners’ hypothesis” by <xref ref-type="bibr" rid="r6">Bigand and Poulin-Charronnat (2006)</xref>, who concluded in their review that even musically untrained listeners can manage most musical tasks in experimental studies.</p>
<p>Our finding of a better-than-chance discrimination performance between AI- and human-generated melody continuations contrasts with Cope’s listening test for identifying AI-generated musical style imitations, known as “The Game”, in which correct responses hovered around chance level. We should take into account, however, that Cope’s AI-based musical examples were generated with many degrees of freedom and very little control over the stimuli, because he was interested in the machine’s potential to copy a musical style following extensive training on notated score material. Conversely, we focused on the creative potential of AI systems to find new and musically valuable solutions within a standardized melody continuation paradigm.</p>
<p>Our results also stand in contrast to the concerns currently expressed by music creators (<xref ref-type="bibr" rid="r19">Goldmedia, 2024</xref>) regarding the musical capacities of current AI models. It should nonetheless be noted that the creative potential of music-specific AI applications remains unclear; such applications may outperform the three LLMs examined here. However, recent studies in the field of <italic>music information retrieval</italic> show that LLMs are regarded as suitable research instruments for music processing and generation tasks (see <ext-link ext-link-type="uri" xlink:href="https://m-a-p.ai/LLM4Music/">https://m-a-p.ai/LLM4Music/</ext-link>). A continuing academic examination of what AI systems can “truly” creatively achieve under controlled conditions is thus highly relevant for the music industry and should underpin the relevant degree programs more firmly going forward (<xref ref-type="bibr" rid="r36">Tillmann &amp; Zaddach, 2024</xref>). As AI applications still develop apace, with no signs of stopping, this topic will only become more relevant and the models more advanced. For the moment, however, we conclude that the creative potential of the systems has improved little since the investigation by <xref ref-type="bibr" rid="r32">Schreiber et al. (2024)</xref>, which was conducted in 2023 with the previous generation of AI systems (e.g., ChatGPT 3.5): human composers still outperform modern LLMs significantly when comparisons are based on standardized conditions such as a melody continuation task. Merely increasing the volume of the AI systems’ training (for example, increasing the total number of parameters by a factor of 500, as happened between ChatGPT 3 and 4; see <xref ref-type="bibr" rid="r30">Portakal, 2023</xref>) does not in itself guarantee a corresponding improvement in the creative musical output produced under controlled conditions.</p>
	<p>Accordingly, our evaluation of music-generating AI systems tallies fully with the review by <xref ref-type="bibr" rid="r24">Mycka and Mańdziuk (2025)</xref>, in which the development of objective and standardized production paradigms comes to the fore. Our suggested melody continuation paradigm could be a first step in this direction.</p>
<sec><title>Limitations</title>
<p>To ensure that the composition task would be as fair and the results as comparable as possible across all four conditions of the independent variable <italic>composer_specific</italic> (Qwen 2, Llama 3, GPT-4, and human), we opted to use the same standardized melody continuation task as in <xref ref-type="bibr" rid="r32">Schreiber et al. (2024)</xref>. Similarly, our choice among available AI models was considerably restricted by the requirement of an interface that allows entering a prompt that references the original melody.</p>
<p>Currently, AI systems specialized in music production, such as Suno AI (<xref ref-type="bibr" rid="r35">Suno, 2024</xref>) or AIVA (<xref ref-type="bibr" rid="r2">Aiva Technologies, 2016</xref>), do not offer this option of prompt input. We therefore decided to focus on a comparison of three selected, currently competing LLMs that represent a generic AI approach. We cannot rule out that a more openly phrased prompt, or the use of AI models specializing more strongly in the domain of music, would yield results more favorable to AI systems. We agree with <xref ref-type="bibr" rid="r32">Schreiber et al. (2024)</xref> that</p>
<disp-quote>
<p>more studies like this should be conducted to constantly assess the development of the quality of AI compositions. By doing so, music research can make a valuable contribution to musicians and creatives in empirically investigating the progress being made by musical AI. (p. 9)</p></disp-quote>


</sec></sec>
</body>
<back>

	<sec sec-type="ethics-statement">
		<title>Ethics Statement</title>
		<p>The present study was conducted in accordance with ethical principles and standards pursuant to the guidelines of the German Society for Psychology (<xref ref-type="bibr" rid="r16">Föderation Deutscher Psychologenvereinigungen, 2022</xref>) and with the principles outlined in the Declaration of Helsinki. The study also adhered to the research regulations of the Hanover University of Music, Drama and Media. According to German law, no ethics approval was required. Written informed consent was obtained from all individuals after they had confirmed that they were willing to participate and had read and understood the instructions and information provided. Participants were informed that participation was voluntary and that they could withdraw from the study at any time. The data were anonymized and treated confidentially.</p>
	</sec>
	
	
<app-group>
<app id="app1"><title>Appendix</title>
<sec><title>Appendix 1: Instructions for the Music Students</title>
<p>The AIs will complete the beginning of the melody shown below with 10 to 20 notes. I therefore ask you to complete the melody shown below (see <xref ref-type="fig" rid="fA.1">Figure A1</xref>) according to your own ideas. A few rules apply: the continuation should ...</p>
<p>... comprise 10 to 20 notes,</p>
<p>... be within the range G3 (g) to G5 (g’’),</p>
<p>... contain different note lengths (i.e., not just quarter notes, for example),</p>
<p>... have a clear melodic peak.</p>
<p>We will need three to five continuations per person. You can compose the versions yourself or ask fellow students to contribute versions. You can either write down the continuation on music paper, sing or play it on your instrument of choice and record it, or enter it directly into a notation program (e.g., MuseScore). For later evaluation, notation on the computer is most practical, but not mandatory. Transposing instruments may be notated as fingered (i.e., at written pitch). For the evaluation, all examples will be transposed to a standard pitch.</p>
	
	<fig id="fA.1" position="anchor" fig-type="figure" orientation="portrait"><label>Figure A1</label><caption>
<title>Score of the Given Melody for the Continuation Task (Chorus of Durch die schweren Zeiten [Through the Hard Times] by Udo Lindenberg)</title></caption><graphic xlink:href="jbdgm.221-fA.1" position="anchor" orientation="portrait"/></fig>

</sec>
<sec><title>Appendix 2: Prompt for the AI Systems (LLMs)</title>
<p>Continue the given melody in the form of a list of (pitch, duration) pairs in Python syntax, where the pitch uses the MIDI pitch standard, and the duration represents the number of quarter notes. Use a pitch of None to represent a rest. Ensure the following:</p>
<list id="L3" list-type="bullet">
<list-item>
<p>The continuation stays between MIDI pitch 55 and MIDI pitch 79</p></list-item>
<list-item>
<p>The continuation is between 10 and 20 notes in length</p></list-item>
<list-item>
<p>The melody should be in the style of a pop ballad</p></list-item>
<list-item>
<p>The continuation should use a variety of note lengths</p></list-item>
<list-item>
<p>The continuation should have a clear melodic peak</p></list-item>
</list>
<preformat>melody = [(62, 0.5), (67, 0.5), (69, 0.5), (71, 2.0), (None, 0.5), (69, 0.5), (72, 0.5), (71, 0.5), (71, 0.5), (67, 0.5), (None, 1.5), (62, 0.5), (67, 0.5), (69, 0.5), (71, 0.5), (67, 0.5), (None, 1.5), (67, 0.25), (67, 0.25), (72, 0.5), (71, 0.5), (71, 0.25), (69, 0.25), (67, 0.5), (None, 1.5)]</preformat>
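<p>The following Python sketch was not part of the prompt; it merely illustrates how a continuation returned in the requested (pitch, duration) format could be checked against the formal constraints listed above (the melodic-peak criterion is not checked automatically here; the continuation used as input is hypothetical):</p>
<preformat># Illustrative only: checks a continuation in the requested (MIDI pitch, duration)
# format against the prompt's formal constraints.
def check_continuation(continuation):
    notes = [(p, d) for p, d in continuation if p is not None]  # rests are ignored
    return {
        "10_to_20_notes": len(notes) in range(10, 21),
        "pitch_between_55_and_79": all(p in range(55, 80) for p, _ in notes),
        "varied_note_lengths": len({d for _, d in notes}) > 1,
    }

# Hypothetical continuation, used only to demonstrate the check
example = [(67, 0.5), (69, 0.5), (71, 1.0), (72, 2.0), (None, 0.5),
           (71, 0.5), (69, 0.5), (67, 1.0), (64, 0.5), (62, 2.0), (62, 1.0)]
print(check_continuation(example))  # expected: all True</preformat>
</sec>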
<sec><title>Appendix 3: Discrimination Task and Performance</title>
	
	<fig id="fA.2" position="anchor" fig-type="figure" orientation="portrait"><label>Figure A2</label><caption>
			<title>Response Scale for the Discrimination Task</title><p><italic>Note.</italic> “eindeutig Mensch” = clearly human; “Urteilssicherheit” = confidence; “eindeutig Maschine” = clearly machine.</p></caption><graphic xlink:href="jbdgm.221-fA.2" position="anchor" orientation="portrait"/></fig>

	
<table-wrap id="tA.1" position="anchor" orientation="portrait">
<label>Table A1</label><caption><title>Hits and Misses per AI System</title></caption>
<table frame="hsides" rules="groups">
<col width="25%" align="left"/>
<col width="15%"/>
<col width="15%"/>
<col width="15%"/>
<col width="15%"/>
<col width="15%"/>
<thead>
<tr>
<th rowspan="2" valign="bottom" align="left">Composer</th>
	<th colspan="2" scope="colgroup">Hits<hr/></th>
	<th colspan="2" scope="colgroup">Misses<hr/></th>
	<th rowspan="2" valign="bottom">Total</th>
</tr>
<tr>
<th scope="colgroup"><italic>n</italic></th>
<th>%</th>
<th><italic>n</italic></th>
<th>%</th>
</tr>
</thead>
<tbody>
<tr>
<td>GPT-4</td>
<td>181</td>
<td align="char" char=".">67.0</td>
<td>89</td>
<td align="char" char=".">33.0</td>
<td>270</td>
</tr>
<tr>
<td>Llama 3</td>
<td>199</td>
<td align="char" char=".">73.7</td>
<td>71</td>
<td align="char" char=".">26.3</td>
<td>270</td>
</tr>
<tr>
<td>Qwen 2</td>
<td>166</td>
<td align="char" char=".">61.5</td>
<td>104</td>
<td align="char" char=".">38.5</td>
<td>270</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<p><italic>Note. N</italic> = 54 participants.</p>
</table-wrap-foot>
</table-wrap>
	
	<fig id="fA.3" position="anchor" fig-type="figure" orientation="portrait"><label>Figure A3</label><caption>
			<title>Sensitivity d’ by Musical Identity</title><p><italic>Note.</italic> Grey dots represent raw data. Black dots and error bars represent means and 95% confidence intervals, respectively. <italic>n</italic><sub>6–10 years</sub> = 9; <italic>n</italic><sub>&gt; 10 years</sub> = 45.</p></caption><graphic xlink:href="jbdgm.221-fA.3" position="anchor" orientation="portrait"/></fig>
	
	
	<table-wrap id="tA.2" position="anchor" orientation="portrait"><?pagebreak-before?>
<label>Table A2</label><caption><title>Frequencies of Signal Detection Theory Response Types in the Data From Schreiber et al. (2024)</title></caption>
<table frame="hsides" rules="groups">
<col width="30%" align="left"/>
<col width="10%"/>
<col width="15%"/>
<col width="30%"/>
<col width="15%"/>
<thead>
<tr>
<th rowspan="2" colspan="2" scope="colgroup" valign="bottom"></th>
	<th colspan="2" scope="colgroup">Melody continuation composed by<hr/></th>
<th rowspan="2" valign="bottom">Row Sums</th>
</tr>
<tr>
<th scope="colgroup">AI</th>
<th>Human</th>
</tr>
</thead>
<tbody>
<tr>
<td rowspan="6">Participant responds</td>
<td rowspan="3">AI</td>
<td>Hits</td>
<td>False alarms</td>
<td/>
</tr>
<tr>
<td><italic>n</italic> = 434</td>
<td><italic>n</italic> = 197</td>
<td><italic>n</italic> = 631 </td>
</tr>
<tr>
<td align="char" char=".">35.0%</td>
<td align="char" char=".">15.9%</td>
<td align="char" char=".">50.9%</td>
</tr>
<tr style="grey-border-top">
<td rowspan="3">Human</td>
<td>Misses</td>
<td>Correct rejections</td>
<td/>
</tr>
<tr>
<td><italic>n</italic> = 186</td>
<td><italic>n</italic> = 423</td>
<td><italic>n</italic> = 609</td>
</tr>
<tr>
<td align="char" char=".">15.0%</td>
<td align="char" char=".">34.1%</td>
<td align="char" char=".">49.1%</td>
</tr>
	<tr style="grey-border-top">
<td rowspan="2" colspan="2" align="left">Column Sums</td>
<td><italic>n</italic> = 620</td>
<td><italic>n</italic> = 620</td>
<td><italic>n</italic> = 1240</td>
</tr>
<tr>
<td align="char" char=".">50.0%</td>
<td align="char" char=".">50.0%</td>
<td align="char" char=".">100.0%</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<p><italic>Note. N</italic> = 62 participants.</p>
</table-wrap-foot>
</table-wrap>
<table-wrap id="tA.3" position="anchor" orientation="portrait">
<label>Table A3</label><caption><title>Hits and Misses per AI System in the Data From Schreiber et al. (2024)</title></caption>
<table frame="hsides" rules="groups">
<col width="25%" align="left"/>
<col width="15%"/>
<col width="15%"/>
<col width="15%"/>
<col width="15%"/>
<col width="15%"/>
<thead>
<tr>
<th align="left">AI System</th>
	<th colspan="2" scope="colgroup">Hits<hr/></th>
	<th colspan="2" scope="colgroup">Misses<hr/></th>
<th rowspan="2" valign="bottom">Total</th>
</tr>
<tr>
<th align="left">Composer</th>
<th><italic>n</italic></th>
<th>%</th>
<th><italic>n</italic></th>
<th>%</th>
</tr>
</thead>
<tbody>
<tr>
<td align="left">Chat GPT 3.5</td>
<td>217</td>
<td align="char" char=".">64.2</td>
<td>121</td>
<td align="char" char=".">35.8</td>
<td>338</td>
</tr>
<tr>
	<td align="left">Magenta 2.0</td>
<td>217</td>
<td align="char" char=".">77.0</td>
<td>65</td>
<td align="char" char=".">23.0</td>
<td>282</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<p><italic>Note. N</italic> = 62 participants.</p>
</table-wrap-foot>
</table-wrap>
	
	<fig id="fA.4" position="anchor" fig-type="figure" orientation="portrait"><label>Figure A4</label><caption>
			<title>Sensitivity d’ by Musical Identity in the Data From Schreiber et al. (2024)</title><p><italic>Note.</italic> Grey dots represent raw data. Black dots and error bars represent means and 95% confidence intervals, respectively. <italic>n</italic><sub>&lt; 6 years</sub> = 16; <italic>n</italic><sub>6–10 years</sub> = 20; <italic>n</italic><sub>&gt; 10 years</sub> = 26.</p></caption><graphic xlink:href="jbdgm.221-fA.4" position="anchor" orientation="portrait"/></fig>
	
	

</sec>
</app>
</app-group><fn-group><fn fn-type="conflict">
<p>RK is Editor-in-Chief, and KS is Editorial Assistant of the <italic>Jahrbuch Musikpsychologie/Yearbook of Music Psychology</italic>. They were not involved in the editorial process of this manuscript.</p></fn></fn-group>
<ref-list><title>References</title>
<ref id="r1"><mixed-citation publication-type="web">Adobe. (2023). <italic>Photoshop</italic> (Version 25) [Computer software]. <ext-link ext-link-type="uri" xlink:href="https://www.adobe.com/de/products/photoshop.html">https://www.adobe.com/de/products/photoshop.html</ext-link></mixed-citation></ref>
<ref id="r2"><mixed-citation publication-type="web">Aiva Technologies. (2016). <italic>AIVA</italic> [Computer software]. <ext-link ext-link-type="uri" xlink:href="https://www.aiva.ai/">https://www.aiva.ai/</ext-link></mixed-citation></ref>
<ref id="r3"><mixed-citation publication-type="web">Alibaba Cloud. (2024). <italic>Qwen</italic> (Version 2) [Computer software]. <ext-link ext-link-type="uri" xlink:href="https://github.com/QwenLM/Qwen2">https://github.com/QwenLM/Qwen2</ext-link></mixed-citation></ref>
<ref id="r4"><mixed-citation publication-type="book">Bi, J. (2015). <italic>Sensory discrimination tests and measurements: Sensometrics in sensory evaluation</italic> (2nd ed.). Wiley Blackwell.</mixed-citation></ref>
<ref id="r5"><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name name-style="western"><surname>Bi</surname>, <given-names>J.</given-names></string-name>, &amp; <string-name name-style="western"><surname>Ennis</surname>, <given-names>D. M.</given-names></string-name></person-group> (<year>2001</year>). <article-title>Statistical models for the A‐Not A method.</article-title> <source>Journal of Sensory Studies</source>, <volume>16</volume>(<issue>2</issue>), <fpage>215</fpage>–<lpage>237</lpage>. <pub-id pub-id-type="doi">10.1111/j.1745-459X.2001.tb00297.x</pub-id></mixed-citation></ref>
<ref id="r6"><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name name-style="western"><surname>Bigand</surname>, <given-names>E.</given-names></string-name>, &amp; <string-name name-style="western"><surname>Poulin-Charronnat</surname>, <given-names>B.</given-names></string-name></person-group> (<year>2006</year>). <article-title>Are we “experienced listeners”? A review of the musical capacities that do not depend on formal musical training.</article-title> <source>Cognition</source>, <volume>100</volume>(<issue>1</issue>), <fpage>100</fpage>–<lpage>130</lpage>. <pub-id pub-id-type="doi">10.1016/j.cognition.2005.11.007</pub-id><pub-id pub-id-type="pmid">16412412</pub-id></mixed-citation></ref>
<ref id="r7"><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name name-style="western"><surname>Brier</surname>, <given-names>S. S.</given-names></string-name></person-group> (<year>1980</year>). <article-title>Analysis of contingency tables under cluster sampling.</article-title> <source>Biometrika</source>, <volume>67</volume>(<issue>3</issue>), <fpage>591</fpage>–<lpage>596</lpage>. <pub-id pub-id-type="doi">10.1093/biomet/67.3.591</pub-id></mixed-citation></ref>
<ref id="r8"><mixed-citation publication-type="web">Cockos. (2024). <italic>Reaper</italic> (Version 7.16) [Computer software]. <ext-link ext-link-type="uri" xlink:href="https://www.reaper.fm/index.php">https://www.reaper.fm/index.php</ext-link></mixed-citation></ref>
<ref id="r9"><mixed-citation publication-type="book">Cope, D. (1996). <italic>Experiments in musical intelligence</italic>. A-R Editions.</mixed-citation></ref>
<ref id="r10"><mixed-citation publication-type="book">Cope, D. (2001). <italic>Virtual music: Computer synthesis of musical style</italic>. MIT Press.</mixed-citation></ref>
<ref id="r11"><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name name-style="western"><surname>Düvel</surname>, <given-names>N.</given-names></string-name>, &amp; <string-name name-style="western"><surname>Kopiez</surname>, <given-names>R.</given-names></string-name></person-group> (<year>2022</year>). <article-title>The paired A–Not A design within signal detection theory: Description, differentiation, power analysis and application.</article-title> <source>Behavior Research Methods</source>, <volume>54</volume>(<issue>5</issue>), <fpage>2334</fpage>–<lpage>2350</lpage>. <pub-id pub-id-type="doi">10.3758/s13428-021-01728-w</pub-id><pub-id pub-id-type="pmid">35132585</pub-id></mixed-citation></ref>
<ref id="r12"><mixed-citation publication-type="book">Ellis, P. D. (2010). <italic>The essential guide to effect sizes: Statistical power, meta-analysis, and the interpretation of research results</italic>. Cambridge University Press.</mixed-citation></ref>
<ref id="r13"><mixed-citation publication-type="web">Evanstein, M. (2023). <italic>SCAMP (Suite for Computer-Assisted Music in Python)</italic> (Version 0.9.2) [Computer software]. <ext-link ext-link-type="uri" xlink:href="http://scamp.marcevanstein.com/">http://scamp.marcevanstein.com/</ext-link></mixed-citation></ref>
<ref id="r14"><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name name-style="western"><surname>Faul</surname>, <given-names>F.</given-names></string-name>, <string-name name-style="western"><surname>Erdfelder</surname>, <given-names>E.</given-names></string-name>, <string-name name-style="western"><surname>Buchner</surname>, <given-names>A.</given-names></string-name>, &amp; <string-name name-style="western"><surname>Lang</surname>, <given-names>A.-G.</given-names></string-name></person-group> (<year>2009</year>). <article-title>Statistical power analyses using G*Power 3.1: Tests for correlation and regression analyses.</article-title> <source>Behavior Research Methods</source>, <volume>41</volume>(<issue>4</issue>), <fpage>1149</fpage>–<lpage>1160</lpage>. <pub-id pub-id-type="doi">10.3758/BRM.41.4.1149</pub-id><pub-id pub-id-type="pmid">19897823</pub-id></mixed-citation></ref>
<ref id="r15"><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name name-style="western"><surname>Ferreira</surname>, <given-names>P.</given-names></string-name>, <string-name name-style="western"><surname>Limongi</surname>, <given-names>R.</given-names></string-name>, &amp; <string-name name-style="western"><surname>Fávero</surname>, <given-names>L. P.</given-names></string-name></person-group> (<year>2023</year>). <article-title>Generating music with data: Application of deep learning models for symbolic music composition.</article-title> <source>Applied Sciences</source>, <volume>13</volume>(<issue>7</issue>), <elocation-id>543</elocation-id>. <pub-id pub-id-type="doi">10.3390/app13074543</pub-id></mixed-citation></ref>
<ref id="r16"><mixed-citation publication-type="web">Föderation Deutscher Psychologenvereinigungen. (2022). <italic>Berufsethische Richtlinien [Guidelines for professional ethics]</italic>. <ext-link ext-link-type="uri" xlink:href="https://www.dgps.de/fileadmin/user_upload/PDF/Berufsetische_Richtlinien/BER-Foederation-20230426-Web-1.pdf">https://www.dgps.de/fileadmin/user_upload/PDF/Berufsetische_Richtlinien/BER-Foederation-20230426-Web-1.pdf</ext-link></mixed-citation></ref>
<ref id="r17"><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name name-style="western"><surname>Frieler</surname>, <given-names>K.</given-names></string-name>, &amp; <string-name name-style="western"><surname>Zaddach</surname>, <given-names>W.-G.</given-names></string-name></person-group> (<year>2022</year>). <article-title>Evaluating an analysis-by-synthesis model for jazz improvisation.</article-title> <source>Transactions of the International Society for Music Information Retrieval</source>, <volume>5</volume>(<issue>1</issue>), <fpage>20</fpage>–<lpage>34</lpage>. <pub-id pub-id-type="doi">10.5334/tismir.87</pub-id></mixed-citation></ref>
<ref id="r18"><mixed-citation publication-type="book">Gioti, A.-M. (2021). Artificial intelligence for music composition. In E. R. Miranda (Ed.), <italic>Handbook of artificial intelligence for music</italic> (pp. 53–73). Springer International Publishing. <pub-id pub-id-type="doi">10.1007/978-3-030-72116-9_3</pub-id></mixed-citation></ref>
<ref id="r19"><mixed-citation publication-type="web">Goldmedia. (2024). <italic>AI and music: Market development of AI in the music sector and impact on music authors and creators in Germany and France</italic>. <ext-link ext-link-type="uri" xlink:href="https://www.goldmedia.com/produkt/study/ki-und-musik/">https://www.goldmedia.com/produkt/study/ki-und-musik/</ext-link></mixed-citation></ref>
<ref id="r20"><mixed-citation publication-type="web">Google AI. (2023). <italic>Magenta Studio</italic> (Version 2) [Computer software]. <ext-link ext-link-type="uri" xlink:href="https://magenta.tensorflow.org/studio/">https://magenta.tensorflow.org/studio/</ext-link></mixed-citation></ref>
<ref id="r21"><mixed-citation publication-type="book">Hautus, M. J., Macmillan, N. A., &amp; Creelman, C. D. (2021). <italic>Detection theory: A user’s guide</italic> (3rd ed.). Routledge. <pub-id pub-id-type="doi">10.4324/9781003203636</pub-id></mixed-citation></ref>
<ref id="r22"><mixed-citation publication-type="web">Meta. (2024). <italic>Llama</italic> (Version 3) [Computer software]. <ext-link ext-link-type="uri" xlink:href="https://llama.meta.com/llama3/">https://llama.meta.com/llama3/</ext-link></mixed-citation></ref>
<ref id="r23"><mixed-citation publication-type="web">MuseScore Ltd. (2024). <italic>MuseScore</italic> (Version 4.3.2) [Computer software]. <ext-link ext-link-type="uri" xlink:href="https://musescore.com/about">https://musescore.com/about</ext-link></mixed-citation></ref>
<ref id="r24"><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name name-style="western"><surname>Mycka</surname>, <given-names>J.</given-names></string-name>, &amp; <string-name name-style="western"><surname>Mańdziuk</surname>, <given-names>J.</given-names></string-name></person-group> (<year>2025</year>). <article-title>Artificial intelligence in music: Recent trends and challenges.</article-title> <source>Neural Computing &amp; Applications</source>, <volume>37</volume>(<issue>2</issue>), <fpage>801</fpage>–<lpage>839</lpage>. <pub-id pub-id-type="doi">10.1007/s00521-024-10555-x</pub-id></mixed-citation></ref>
<ref id="r25"><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name name-style="western"><surname>Oksanen</surname>, <given-names>A.</given-names></string-name>, <string-name name-style="western"><surname>Cvetkovic</surname>, <given-names>A.</given-names></string-name>, <string-name name-style="western"><surname>Akin</surname>, <given-names>N.</given-names></string-name>, <string-name name-style="western"><surname>Latikka</surname>, <given-names>R.</given-names></string-name>, <string-name name-style="western"><surname>Bergdahl</surname>, <given-names>J.</given-names></string-name>, <string-name name-style="western"><surname>Chen</surname>, <given-names>Y.</given-names></string-name>, &amp; <string-name name-style="western"><surname>Savela</surname>, <given-names>N.</given-names></string-name></person-group> (<year>2023</year>). <article-title>Artificial intelligence in fine arts: A systematic review of empirical research.</article-title> <source>Computers in Human Behavior: Artificial Humans</source>, <volume>1</volume>(<issue>2</issue>), <elocation-id>100004</elocation-id>. <pub-id pub-id-type="doi">10.1016/j.chbah.2023.100004</pub-id></mixed-citation></ref>
<ref id="r26"><mixed-citation publication-type="web">OpenAI. (2022). <italic>ChatGPT</italic> (Version 3.5) [Computer software]. <ext-link ext-link-type="uri" xlink:href="https://openai.com/chatgpt/">https://openai.com/chatgpt/</ext-link></mixed-citation></ref>
<ref id="r27"><mixed-citation publication-type="web"> OpenAI. (2023a). <italic>ChatGPT</italic> (Version 4) [Computer software]. <ext-link ext-link-type="uri" xlink:href="https://openai.com/index/gpt-4/">https://openai.com/index/gpt-4/</ext-link></mixed-citation></ref>
<ref id="r28"><mixed-citation publication-type="web"> OpenAI. (2023b). <italic>DALL·E</italic> (Version 3) [Computer software]. <ext-link ext-link-type="uri" xlink:href="https://openai.com/index/dall-e-3/">https://openai.com/index/dall-e-3/</ext-link></mixed-citation></ref>
<ref id="r29"><mixed-citation publication-type="web"> OpenAI. (2024). <italic>ChatGPT</italic> (Version 4o) [Computer software]. <ext-link ext-link-type="uri" xlink:href="https://openai.com/index/hello-gpt-4o/">https://openai.com/index/hello-gpt-4o/</ext-link></mixed-citation></ref>
<ref id="r30"><mixed-citation publication-type="web">Portakal, E. (2023, March 20). GPT-3 vs. GPT-4 Vergleich. <italic>TextCortex Blog</italic>. <ext-link ext-link-type="uri" xlink:href="https://textcortex.com/de/post/gpt-3-vs-gpt-4-comparison">https://textcortex.com/de/post/gpt-3-vs-gpt-4-comparison</ext-link></mixed-citation></ref>
<ref id="r31"><mixed-citation publication-type="web">R Core Team. (2024). <italic>R: A language and environment for statistical computing</italic> (Version 4.4.1) [Computer software]. <ext-link ext-link-type="uri" xlink:href="https://www.r-project.org/">https://www.r-project.org/</ext-link></mixed-citation></ref>
<ref id="r32"><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name name-style="western"><surname>Schreiber</surname>, <given-names>A.</given-names></string-name>, <string-name name-style="western"><surname>Sander</surname>, <given-names>K.</given-names></string-name>, <string-name name-style="western"><surname>Kopiez</surname>, <given-names>R.</given-names></string-name>, &amp; <string-name name-style="western"><surname>Thöne</surname>, <given-names>R.</given-names></string-name></person-group> (<year>2024</year>). <article-title>The creative performance of the AI agents ChatGPT and Google Magenta compared to human-based solutions in a standardized melody continuation task.</article-title> <source>Jahrbuch Musikpsychologie</source>, <volume>32</volume>, <elocation-id>e195</elocation-id>. <pub-id pub-id-type="doi">10.5964/jbdgm.195</pub-id></mixed-citation></ref>
<ref id="r33"><mixed-citation publication-type="web">Spitfire Audio. (2023). <italic>BBC Symphony Orchestra Discover</italic> (Version 1.7.0) [Computer software]. <ext-link ext-link-type="uri" xlink:href="https://www.spitfireaudio.com/bbc-symphony-orchestra-discover">https://www.spitfireaudio.com/bbc-symphony-orchestra-discover</ext-link></mixed-citation></ref>
<ref id="r34"><mixed-citation publication-type="web">Steinbeck, W. (2016). Würfelmusik. In L. Lütteken (Ed.), <italic>MGG Online</italic>. <ext-link ext-link-type="uri" xlink:href="https://www.mgg-online.com/mgg/stable/12552">https://www.mgg-online.com/mgg/stable/12552</ext-link></mixed-citation></ref>
<ref id="r35"><mixed-citation publication-type="web">Suno. (2024). <italic>Suno AI</italic> (Version 3.5) [Computer software]. <ext-link ext-link-type="uri" xlink:href="https://suno.com/">https://suno.com/</ext-link></mixed-citation></ref>
<ref id="r36"><mixed-citation publication-type="book">Tillmann, B., &amp; Zaddach, W.-G. (2024). Artificial intelligence in songwriting and composing: Perspectives and challenges in creative practices. In E. Voigts, R. M. Auer, D. Elflein, S. Kunas, J. Röhnert, &amp; C. Seelinger (Eds.), <italic>Artificial intelligence—Intelligent art?: Human-machine interaction and creative practice</italic> (pp. 217–231). transcript. <pub-id pub-id-type="doi">10.14361/9783839469224</pub-id></mixed-citation></ref>
<ref id="r37"><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name name-style="western"><surname>Wooldridge</surname>, <given-names>M.</given-names></string-name>, &amp; <string-name name-style="western"><surname>Jennings</surname>, <given-names>N. R.</given-names></string-name></person-group> (<year>1995</year>). <article-title>Intelligent agents: Theory and practice.</article-title> <source>The Knowledge Engineering Review</source>, <volume>10</volume>(<issue>2</issue>), <fpage>115</fpage>–<lpage>152</lpage>. <pub-id pub-id-type="doi">10.1017/S0269888900008122</pub-id></mixed-citation></ref>
<ref id="r38"><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name name-style="western"><surname>Zhang</surname>, <given-names>J. D.</given-names></string-name>, &amp; <string-name name-style="western"><surname>Schubert</surname>, <given-names>E.</given-names></string-name></person-group> (<year>2019</year>). <article-title>A single item measure for identifying musician and nonmusician categories based on measures of musical sophistication.</article-title> <source>Music Perception</source>, <volume>36</volume>(<issue>5</issue>), <fpage>457</fpage>–<lpage>467</lpage>. <pub-id pub-id-type="doi">10.1525/mp.2019.36.5.457</pub-id></mixed-citation></ref>
</ref-list>
	<sec sec-type="data-availability" id="das"><title>Data Availability</title>
		<p>For this article, R scripts, data, codebook, and musical stimuli are available (see <xref ref-type="bibr" rid="sp1_r1">Meier et al., 2025</xref>).</p>
	</sec>	

	
	
	
	<sec sec-type="supplementary-material" id="sp1"><title>Supplementary Materials</title>
		<p>For this article, R scripts, data, codebook, and musical stimuli are available (see <xref ref-type="bibr" rid="sp1_r1">Meier et al., 2025</xref>). The study was pre-registered (see <xref ref-type="bibr" rid="sp1_r2">Meier et al., 2024</xref>).</p>
		<ref-list content-type="supplementary-material" id="suppl-ref-list">
			
			<ref id="sp1_r2">
				<mixed-citation publication-type="supplementary-material">
					<person-group person-group-type="author">
						<name name-style="western">
							<surname>Meier</surname>
							<given-names>N.</given-names>
						</name>
						<name name-style="western">
							<surname>Kopiez</surname>
							<given-names>R.</given-names>
						</name>
						<name name-style="western">
							<surname>Sander</surname>
							<given-names>K.</given-names>
						</name>
					</person-group> (<year>2024</year>). <source>Melody continuation with AI: The creative achievement of different language models compared to music students</source> <comment>[Preregistration]</comment>. <publisher-name>OSF Registries</publisher-name>. <ext-link ext-link-type="uri" xlink:href="https://osf.io/2zbkx">https://osf.io/2zbkx</ext-link>		
				</mixed-citation>
			</ref>
			
			<ref id="sp1_r1">
				<mixed-citation publication-type="supplementary-material">
					<person-group person-group-type="author">
							<name name-style="western">
								<surname>Meier</surname>
								<given-names>N.</given-names>
							</name>
							<name name-style="western">
								<surname>Sander</surname>
								<given-names>K.</given-names>
							</name>
							<name name-style="western">
								<surname>Schreiber</surname>
								<given-names>A.</given-names>
							</name>
							<name name-style="western">
								<surname>Kopiez</surname>
								<given-names>R.</given-names>
							</name>
					</person-group> (<year>2025</year>). <source>The creative musical achievement of AI systems compared to music students: A replication of the study by Schreiber et al. (2024)</source> <comment>[Data, codebook, code, stimuli]</comment>. <publisher-name>OSF</publisher-name>. <ext-link ext-link-type="uri" xlink:href="https://osf.io/5mcpt/">https://osf.io/5mcpt/</ext-link>		
				</mixed-citation>
			</ref>
			
			
		</ref-list>
	</sec>
			

<fn-group>
<fn fn-type="financial-disclosure"><p>The authors have no funding to report.</p></fn>
</fn-group>
<ack>
<p>The authors have no additional (i.e., non-financial) support to report.</p>
</ack>
</back>
</article>
